Dear libav-users,

I wrote a program which extracts motion vectors from a video stream and encountered the problem that the runtime of sws_scale changes depending on whether I put a sleep command in the main loop of the caller. If no sleep command is present, sws_scale returns after approximately 0.9 ms on my machine. With a sleep command of arbitrary length (I tested 1 ms, 25 ms, 500 ms and 1 s) the runtime is around 7 ms.

I am using libswscale as shipped in FFMPEG 4.1 and my implementation is similar to the code used in OpenCV VideoCapture (https://github.com/opencv/opencv/blob/master/modules/videoio/src/cap_ffmpeg_impl.hpp#L431)

I would be glad if someone could provide me with at least an idea of what is going wrong here. My code is attached below.

Best regards,

Lukas


// Compile command: g++ -I ~/boost -I /usr/include/python3.6m/ -fpic video_cap.cpp -o main -L ~/boost/stage/lib -lboost_python36 -lboost_numpy36 -lpython3.6m `pkg-config --cflags --libs libavformat libswscale opencv4` -Wl,-Bsymbolic

#include <thread>
#include <iostream>
#include <vector>
#include <chrono>


#include <opencv2/opencv.hpp>
#include <opencv2/core/types.hpp>
#include <opencv2/imgproc.hpp>

// FFMPEG
extern "C" {
#include <libavutil/motion_vector.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
}

// for changing the dtype of motion vector
#define MVS_DTYPE int32_t
#define MVS_DTYPE_NP NPY_INT32


// Lightweight image descriptor (mirrors the struct used by OpenCV's FFmpeg
// capture backend). It does NOT own the pixel buffer it points at; here it
// aliases VideoCap::rgb_frame. Members are zero-initialized by default so a
// default-constructed instance is a well-defined "empty" image (the original
// left them uninitialized and relied on an external memset).
struct Image_FFMPEG
{
    unsigned char* data = nullptr; // pixel buffer (not owned)
    int step = 0;                  // row stride in bytes
    int width = 0;                 // frame width in pixels
    int height = 0;                // frame height in pixels
    int cn = 0;                    // number of interleaved channels (3 = BGR)
};


class VideoCap {

private:

    const char *url;
    AVDictionary *opts;
    AVCodec *codec;
    AVFormatContext *fmt_ctx;
    AVCodecContext *video_dec_ctx;
    AVStream *video_stream;
    int video_stream_idx;
    AVFrame *frame;
    AVFrame rgb_frame;
    Image_FFMPEG picture;
    struct SwsContext *img_convert_ctx;


public:

    VideoCap() {
        this->opts = NULL;
        this->codec = NULL;
        this->fmt_ctx = NULL;
        this->video_dec_ctx = NULL;
        this->video_stream = NULL;
        this->video_stream_idx = -1;
        this->frame = NULL;
        this->img_convert_ctx = NULL;

        memset(&(this->rgb_frame), 0, sizeof(this->rgb_frame));
        memset(&(this->picture), 0, sizeof(this->picture));
    }


    void free_all() {
        if (this->img_convert_ctx) {
            sws_freeContext(this->img_convert_ctx);
            this->img_convert_ctx = 0;
        }

        if (this->frame)
            av_frame_free(&(this->frame));

        av_frame_unref(&(this->rgb_frame));

        if (this->video_dec_ctx)
            avcodec_free_context(&(this->video_dec_ctx));

        if (this->fmt_ctx)
            avformat_close_input(&(this->fmt_ctx));
    }


    void release(void) {
        this->free_all();
    }


    bool open(const char *url) {
        this->url = url;
        int ret;

        // open RTSP stream with TCP
        av_dict_set(&(this->opts), "rtsp_transport", "tcp", 0);
        ret = avformat_open_input(&(this->fmt_ctx), url, NULL, &(this->opts));
        if (ret < 0) {
            std::cerr << "Could not open source file ' " << url << "'" << std::endl;
            return false;
        }

        // read packets of a media file to get stream information.
        ret = avformat_find_stream_info(this->fmt_ctx, NULL);
        if (ret < 0) {
            std::cerr << "Could not find stream information" << std::endl;
            return false;
        }

        ret = this->open_codec_context(this->fmt_ctx, AVMEDIA_TYPE_VIDEO);
        if (!ret) {
            std::cerr << "Could not create codex context" << std::endl;
            return false;
        }

        // print info (duration, bitrate, streams, container, programs, metadata, side data, codec, time base)
        av_dump_format(this->fmt_ctx, 0, url, 0);

        if (!this->video_stream) {
            std::cerr << "Could not find video stream in the input, aborting" << std::endl;
            this->free_all();
            return false;
        }

        this->frame = av_frame_alloc();
        if (!this->frame) {
            std::cerr << "Could not allocate frame" << std::endl;
            this->free_all();
            return false;
        }

        return true;
    }


    bool open_codec_context(AVFormatContext *fmt_ctx, enum AVMediaType type) {         // find the most suitable stream of given type (e.g. video) and set the codec accordingly         int ret = av_find_best_stream(fmt_ctx, type, -1, -1, &(this->codec), 0);
        if (ret < 0) {
            std::cerr << "Could not find " << av_get_media_type_string(type) << " stream in input file '" << this->url << "'" << std::endl;
            return false;
        }
        else {
            // set stream in format context
            this->video_stream_idx = ret;
            AVStream *st = fmt_ctx->streams[this->video_stream_idx];

            // allocate an AVCodecContext and set its fields to default values
            this->video_dec_ctx = avcodec_alloc_context3(this->codec);
            if (!this->video_dec_ctx) {
                std::cerr << "Failed to allocate codec" << std::endl;
                return false;
            }

            // fill the codec context based on the values from the supplied codec parameters             ret = avcodec_parameters_to_context(this->video_dec_ctx, st->codecpar);
            if (ret < 0) {
                std::cerr << "Failed to copy codec parameters to codec context" << std::endl;
                return false;
            }

            this->video_dec_ctx->thread_count = std::thread::hardware_concurrency();             std::cerr << "Using parallel processing with " << this->video_dec_ctx->thread_count << " threads" << std::endl;

            // backup encoder's width/height
            int enc_width = this->video_dec_ctx->width;
            int enc_height = this->video_dec_ctx->height;

            // Init the video decoder with the codec and set additional option to extract motion vectors
            av_dict_set(&(this->opts), "flags2", "+export_mvs", 0);
            ret = avcodec_open2(this->video_dec_ctx, this->codec, &(this->opts));
            if (ret < 0) {
                std::cerr << "Failed to open " << av_get_media_type_string(type) << " codec" << std::endl;
                return false;
            }

            this->video_stream = fmt_ctx->streams[this->video_stream_idx];

            // checking width/height (since decoder can sometimes alter it, eg. vp6f)
            if (enc_width && (this->video_dec_ctx->width != enc_width)) {
                this->video_dec_ctx->width = enc_width;
            }
            if (enc_height && (this->video_dec_ctx->height != enc_height)) {
                this->video_dec_ctx->height = enc_height;
            }

            this->picture.width = this->video_dec_ctx->width;
            this->picture.height = this->video_dec_ctx->height;
            this->picture.cn = 3;
            this->picture.step = 0;
            this->picture.data = NULL;
        }

        return true;
    }


    bool read(cv::OutputArray cv_frame, char *frame_type, MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {

        uint8_t* data = 0;
        int step = 0, width = 0, height = 0, cn = 0;

        // loop over different streams (video, audio) in the file
        while(1) {
            AVPacket pkt = { 0 };

            // read next packet from the stream
            int ret = av_read_frame(this->fmt_ctx, &pkt);
            if (ret < 0) {
                return false;
            }

            // if the packet is not from the video stream don't do anything and get next packet
            if (pkt.stream_index != this->video_stream_idx) {
                continue;
            }
            // if the packet is from the video stream send it to decoder
            else {

                bool ret = this->decode_packet(&pkt, &data, &step, &width, &height, &cn, frame_type, motion_vectors, num_mvs);
                if (!ret) {
                    return false;
                }

                cv::Mat(height, width, CV_MAKETYPE(CV_8U, cn), data, step).copyTo(cv_frame);

                av_packet_unref(&pkt);

                return true;
            }
        }
    }


    bool frame_to_buffer(uint8_t** data, int* step, int* width, int* height, int* cn)
    {
        if (!this->video_stream || !(this->frame->data[0])) {
            return false;
        }

        if (this->img_convert_ctx == NULL ||
            this->picture.width != this->video_dec_ctx->width ||
            this->picture.height != this->video_dec_ctx->height ||
            this->picture.data == NULL) {

            // Some sws_scale optimizations have some assumptions about alignment of data/step/width/height             // Also we use coded_width/height to workaround problem with legacy ffmpeg versions (like n0.8)
            int buffer_width = this->video_dec_ctx->coded_width;
            int buffer_height = this->video_dec_ctx->coded_height;

            this->img_convert_ctx = sws_getCachedContext(
                    this->img_convert_ctx,
                    buffer_width, buffer_height,
                    this->video_dec_ctx->pix_fmt,
                    buffer_width, buffer_height,
                    AV_PIX_FMT_BGR24,
                    SWS_BICUBIC,
                    NULL, NULL, NULL
                    );

            if (this->img_convert_ctx == NULL) {
                std::cerr << "Allocation of conversion context failed" << std::endl;
                return false;
            }

            av_frame_unref(&(this->rgb_frame));
            this->rgb_frame.format = AV_PIX_FMT_BGR24;
            this->rgb_frame.width = buffer_width;
            this->rgb_frame.height = buffer_height;
            if (0 != av_frame_get_buffer(&(this->rgb_frame), 32)) {
                std::cerr << "Not enough memory to allocate buffer for frame conversion" << std::endl;
                return false;
            }

            this->picture.width = this->video_dec_ctx->width;
            this->picture.height = this->video_dec_ctx->height;
            this->picture.cn = 3;
            this->picture.data = this->rgb_frame.data[0];
            this->picture.step = this->rgb_frame.linesize[0];
        }

        auto start = std::chrono::high_resolution_clock::now();
        sws_scale(
            this->img_convert_ctx,
            this->frame->data,
            this->frame->linesize,
            0, this->video_dec_ctx->coded_height,
            this->rgb_frame.data,
            this->rgb_frame.linesize
            );
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        std::cout << "sws_scale (C++): " << elapsed.count() << " s\n";

        *data = this->picture.data;
        *step = this->picture.step;
        *width = this->picture.width;
        *height = this->picture.height;
        *cn = this->picture.cn;

        return true;
    }


    bool decode_packet(const AVPacket *pkt, uint8_t** data, int* step, int* width, int* height, int* cn, char *frame_type, MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {
        // send encoded data packet to the decoder
        int ret = avcodec_send_packet(this->video_dec_ctx, pkt);
        if (ret < 0) {
            std::cerr << "Error while sending a packet to the decoder: " << ret << std::endl;
            return false;
        }

        // loop over packets until the next frame is fully assembled
        while (ret >= 0)  {
            // try to get the next frame from decoder
            ret = avcodec_receive_frame(this->video_dec_ctx, this->frame);
            // failed: end of stream or no frame available, stop and return with success
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            // failed: another error occured, return with error
            else if (ret < 0) {
                std::cerr << "Error while receiving a frame from the decoder: " << ret << std::endl;
                return false;
            }
            // sucessfully decoded new frame, get motion vectors
            else {

                AVFrameSideData *sd = av_frame_get_side_data(this->frame, AV_FRAME_DATA_MOTION_VECTORS);
                if (sd) {
                    AVMotionVector *mvs = (AVMotionVector *)sd->data;

                    *num_mvs = sd->size / sizeof(*mvs);

                    if (*num_mvs > 0) {

                        // allocate memory for motion vectors as 1D array
                        if (!(*motion_vectors = (MVS_DTYPE *) malloc(*num_mvs * 10 * sizeof(MVS_DTYPE)))) {                             std::cerr << "Memory allocation for motion vectors failed." << std::endl;
                            return false;
                        }

                        // store the motion vectors in the allocated memory (C contiguous)
                        for (MVS_DTYPE i = 0; i < *num_mvs; ++i) {
                            *(*motion_vectors + i*10     ) = static_cast<MVS_DTYPE>(mvs[i].source);                             *(*motion_vectors + i*10 +  1) = static_cast<MVS_DTYPE>(mvs[i].w);                             *(*motion_vectors + i*10 +  2) = static_cast<MVS_DTYPE>(mvs[i].h);                             *(*motion_vectors + i*10 +  3) = static_cast<MVS_DTYPE>(mvs[i].src_x);                             *(*motion_vectors + i*10 +  4) = static_cast<MVS_DTYPE>(mvs[i].src_y);                             *(*motion_vectors + i*10 +  5) = static_cast<MVS_DTYPE>(mvs[i].dst_x);                             *(*motion_vectors + i*10 +  6) = static_cast<MVS_DTYPE>(mvs[i].dst_y);                             *(*motion_vectors + i*10 +  7) = static_cast<MVS_DTYPE>(mvs[i].motion_x);                             *(*motion_vectors + i*10 +  8) = static_cast<MVS_DTYPE>(mvs[i].motion_y);                             *(*motion_vectors + i*10 +  9) = static_cast<MVS_DTYPE>(mvs[i].motion_scale);                             //*(*motion_vectors + i*11 + 10) = static_cast<MVS_DTYPE>(mvs[i].flags);
                        }
                    }
                }

                // convert AVFrame to numpy ndarray
                if(!this->frame_to_buffer(data, step, width, height, cn)) {
                    std::cerr << "Conversion of frame failed." << std::endl;
                    return false;
                }

                // get frame type (I, P, B, etc.) and create a null terminated c-string                 frame_type[0] = av_get_picture_type_char(this->frame->pict_type);
                frame_type[1] = '\0';
            }
        }

        return true;
    }

};



//##############################################################################
//
//         MAIN
//
//##############################################################################


// Overlay every motion vector on the frame as a thin red arrow pointing from
// its source position to its destination position.
void draw_motion_vectors(cv::Mat frame, std::vector<AVMotionVector> *motion_vectors) {
    for (const AVMotionVector &mv : *motion_vectors) {
        const cv::Point origin(mv.src_x, mv.src_y);
        const cv::Point target(mv.dst_x, mv.dst_y);
        cv::arrowedLine(frame, origin, target, cv::Scalar(0, 0, 255), 1, cv::LINE_AA, 0, 0.1);
    }
}


int main(int argc, char **argv)
{
    // filename of the video file
    const char *url = "vid.mp4";

    VideoCap cap;

    // open the video file
    bool ret = cap.open(url);
    if (!ret) {
        std::cerr << "Could not open the video url" << std::endl;
        return -1;
    }

    // continuously read and display video frames and motion vectors
    while(1) {

        std::cout << "##########################" << std::endl;

        cv::Mat frame;
        MVS_DTYPE *motion_vectors = NULL;
        MVS_DTYPE num_mvs = 0;
        char frame_type[2] = "?";

        auto start = std::chrono::high_resolution_clock::now();

        // read next video frame and corresponding motion vectors
        bool ret = cap.read(frame, frame_type, &motion_vectors, &num_mvs);

        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        std::cout << "Elapsed time: " << elapsed.count() << " s\n";

        std::chrono::milliseconds timespan(25);
        std::this_thread::sleep_for(timespan);

        // if there is an error reading the frame
        if(!ret) {
            std::cerr << "Could not read the next frame" << std::endl;
            return -1;
        }
        else {

            // if the frame is not empty
            cv::Size s = frame.size();
            if (s.height > 0 && s.width > 0) {

                // print type of frame (I, P, B, etc)
                std::cout << "Frame type: " << frame_type << std::endl;

                // print motion vectors
                /*for (std::vector<AVMotionVector>::size_type i = 0; i < motion_vectors.size(); i++) {                     std::cout << std::setw(7) << "src: " << motion_vectors[i].source                             << std::setw(6) << "w: " << static_cast<int16_t>(motion_vectors[i].w)                             << std::setw(6) << "h: " << static_cast<int16_t>(motion_vectors[i].h)                             << std::setw(10) << "src_x: " << motion_vectors[i].src_x                             << std::setw(10) << "src_y: " << motion_vectors[i].src_y                             << std::setw(10) << "dst_x: " << motion_vectors[i].dst_x                             << std::setw(10) << "dst_y: " << motion_vectors[i].dst_y                             << std::setw(10) << "mot_x: " << motion_vectors[i].motion_x                             << std::setw(12) << "mot_y: " << motion_vectors[i].motion_y                             << std::setw(12) << "mot_scl: " << motion_vectors[i].motion_scale                             << std::setw(9) << "flags: " << motion_vectors[i].flags << std::endl;
                }*/

                //draw_motion_vectors(frame, &motion_vectors);

                // show frame
                cv::imshow("Frame", frame);

                if (motion_vectors)
                    free(motion_vectors);
                    motion_vectors = NULL;

                // if user presses "ESC" stop program
                char c=(char)cv::waitKey(1);
                if(c==27) {
                    break;
                }
            }
        }
    }


    // when everything done, release the video capture object
    cap.release();

    // close the GUI window
    cv::destroyAllWindows();

    return 0;
}

_______________________________________________
Libav-user mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/libav-user

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Reply via email to