Dear libav-users,

I wrote a program which extracts motion vectors from a video stream and encountered the problem that the runtime of sws_scale changes depending on whether I put a sleep command in the main loop of the caller. If no sleep command is present, sws_scale returns after approximately 0.9 ms on my machine. With a sleep command of arbitrary length (I tested 1 ms, 25 ms, 500 ms and 1 s) the runtime is around 7 ms.

I am using libswscale as shipped in FFMPEG 4.1 and my implementation is similar to the code used in OpenCV VideoCapture (https://github.com/opencv/opencv/blob/master/modules/videoio/src/cap_ffmpeg_impl.hpp#L431)

I would be glad if someone could provide me with at least an idea of what is going wrong here. My code is attached below.

Best regards,

Lukas


// Compile command: g++ -I ~/boost -I /usr/include/python3.6m/ -fpic video_cap.cpp -o main -L ~/boost/stage/lib -lboost_python36 -lboost_numpy36 -lpython3.6m `pkg-config --cflags --libs libavformat libswscale opencv4` -Wl,-Bsymbolic

#include <thread>
#include <iostream>
#include <vector>
#include <chrono>


#include <opencv2/opencv.hpp>
#include <opencv2/core/types.hpp>
#include <opencv2/imgproc.hpp>

// FFMPEG
extern "C" {
#include <libavutil/motion_vector.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavutil/imgutils.h>
}

// for changing the dtype of motion vector
#define MVS_DTYPE int32_t
#define MVS_DTYPE_NP NPY_INT32


// Lightweight image descriptor (mirrors the struct used by OpenCV's FFmpeg
// capture backend). It does NOT own the pixel buffer it points at; here it
// aliases VideoCap::rgb_frame. Members are zero-initialized by default so a
// default-constructed instance is a well-defined "empty" image (the original
// left them uninitialized and relied on an external memset).
struct Image_FFMPEG
{
    unsigned char* data = nullptr; // pixel buffer (not owned)
    int step = 0;                  // row stride in bytes
    int width = 0;                 // frame width in pixels
    int height = 0;                // frame height in pixels
    int cn = 0;                    // number of interleaved channels (3 = BGR)
};


class VideoCap {

private:

    const char *url;
    AVDictionary *opts;
    AVCodec *codec;
    AVFormatContext *fmt_ctx;
    AVCodecContext *video_dec_ctx;
    AVStream *video_stream;
    int video_stream_idx;
    AVFrame *frame;
    AVFrame rgb_frame;
    Image_FFMPEG picture;
    struct SwsContext *img_convert_ctx;


public:

    VideoCap() {
        this->opts = NULL;
        this->codec = NULL;
        this->fmt_ctx = NULL;
        this->video_dec_ctx = NULL;
        this->video_stream = NULL;
        this->video_stream_idx = -1;
        this->frame = NULL;
        this->img_convert_ctx = NULL;

        memset(&(this->rgb_frame), 0, sizeof(this->rgb_frame));
        memset(&(this->picture), 0, sizeof(this->picture));
    }


    void free_all() {
        if (this->img_convert_ctx) {
            sws_freeContext(this->img_convert_ctx);
            this->img_convert_ctx = 0;
        }

        if (this->frame)
            av_frame_free(&(this->frame));

        av_frame_unref(&(this->rgb_frame));

        if (this->video_dec_ctx)
            avcodec_free_context(&(this->video_dec_ctx));

        if (this->fmt_ctx)
            avformat_close_input(&(this->fmt_ctx));
    }


    void release(void) {
        this->free_all();
    }


    bool open(const char *url) {
        this->url = url;
        int ret;

        // open RTSP stream with TCP
        av_dict_set(&(this->opts), "rtsp_transport", "tcp", 0);
        ret = avformat_open_input(&(this->fmt_ctx), url, NULL, &(this->opts));
        if (ret < 0) {
            std::cerr << "Could not open source file ' " << url << "'" << std::endl;
            return false;
        }

        // read packets of a media file to get stream information.
        ret = avformat_find_stream_info(this->fmt_ctx, NULL);
        if (ret < 0) {
            std::cerr << "Could not find stream information" << std::endl;
            return false;
        }

        ret = this->open_codec_context(this->fmt_ctx, AVMEDIA_TYPE_VIDEO);
        if (!ret) {
            std::cerr << "Could not create codex context" << std::endl;
            return false;
        }

        // print info (duration, bitrate, streams, container, programs, metadata, side data, codec, time base)
        av_dump_format(this->fmt_ctx, 0, url, 0);

        if (!this->video_stream) {
            std::cerr << "Could not find video stream in the input, aborting" << std::endl;
            this->free_all();
            return false;
        }

        this->frame = av_frame_alloc();
        if (!this->frame) {
            std::cerr << "Could not allocate frame" << std::endl;
            this->free_all();
            return false;
        }

        return true;
    }


    bool open_codec_context(AVFormatContext *fmt_ctx, enum AVMediaType type) {         // find the most suitable stream of given type (e.g. video) and set the codec accordingly         int ret = av_find_best_stream(fmt_ctx, type, -1, -1, &(this->codec), 0);
        if (ret < 0) {
            std::cerr << "Could not find " << av_get_media_type_string(type) << " stream in input file '" << this->url << "'" << std::endl;
            return false;
        }
        else {
            // set stream in format context
            this->video_stream_idx = ret;
            AVStream *st = fmt_ctx->streams[this->video_stream_idx];

            // allocate an AVCodecContext and set its fields to default values
            this->video_dec_ctx = avcodec_alloc_context3(this->codec);
            if (!this->video_dec_ctx) {
                std::cerr << "Failed to allocate codec" << std::endl;
                return false;
            }

            // fill the codec context based on the values from the supplied codec parameters             ret = avcodec_parameters_to_context(this->video_dec_ctx, st->codecpar);
            if (ret < 0) {
                std::cerr << "Failed to copy codec parameters to codec context" << std::endl;
                return false;
            }

            this->video_dec_ctx->thread_count = std::thread::hardware_concurrency();             std::cerr << "Using parallel processing with " << this->video_dec_ctx->thread_count << " threads" << std::endl;

            // backup encoder's width/height
            int enc_width = this->video_dec_ctx->width;
            int enc_height = this->video_dec_ctx->height;

            // Init the video decoder with the codec and set additional option to extract motion vectors
            av_dict_set(&(this->opts), "flags2", "+export_mvs", 0);
            ret = avcodec_open2(this->video_dec_ctx, this->codec, &(this->opts));
            if (ret < 0) {
                std::cerr << "Failed to open " << av_get_media_type_string(type) << " codec" << std::endl;
                return false;
            }

            this->video_stream = fmt_ctx->streams[this->video_stream_idx];

            // checking width/height (since decoder can sometimes alter it, eg. vp6f)
            if (enc_width && (this->video_dec_ctx->width != enc_width)) {
                this->video_dec_ctx->width = enc_width;
            }
            if (enc_height && (this->video_dec_ctx->height != enc_height)) {
                this->video_dec_ctx->height = enc_height;
            }

            this->picture.width = this->video_dec_ctx->width;
            this->picture.height = this->video_dec_ctx->height;
            this->picture.cn = 3;
            this->picture.step = 0;
            this->picture.data = NULL;
        }

        return true;
    }


    bool read(cv::OutputArray cv_frame, char *frame_type, MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {

        uint8_t* data = 0;
        int step = 0, width = 0, height = 0, cn = 0;

        // loop over different streams (video, audio) in the file
        while(1) {
            AVPacket pkt = { 0 };

            // read next packet from the stream
            int ret = av_read_frame(this->fmt_ctx, &pkt);
            if (ret < 0) {
                return false;
            }

            // if the packet is not from the video stream don't do anything and get next packet
            if (pkt.stream_index != this->video_stream_idx) {
                continue;
            }
            // if the packet is from the video stream send it to decoder
            else {

                bool ret = this->decode_packet(&pkt, &data, &step, &width, &height, &cn, frame_type, motion_vectors, num_mvs);
                if (!ret) {
                    return false;
                }

                cv::Mat(height, width, CV_MAKETYPE(CV_8U, cn), data, step).copyTo(cv_frame);

                av_packet_unref(&pkt);

                return true;
            }
        }
    }


    bool frame_to_buffer(uint8_t** data, int* step, int* width, int* height, int* cn)
    {
        if (!this->video_stream || !(this->frame->data[0])) {
            return false;
        }

        if (this->img_convert_ctx == NULL ||
            this->picture.width != this->video_dec_ctx->width ||
            this->picture.height != this->video_dec_ctx->height ||
            this->picture.data == NULL) {

            // Some sws_scale optimizations have some assumptions about alignment of data/step/width/height             // Also we use coded_width/height to workaround problem with legacy ffmpeg versions (like n0.8)
            int buffer_width = this->video_dec_ctx->coded_width;
            int buffer_height = this->video_dec_ctx->coded_height;

            this->img_convert_ctx = sws_getCachedContext(
                    this->img_convert_ctx,
                    buffer_width, buffer_height,
                    this->video_dec_ctx->pix_fmt,
                    buffer_width, buffer_height,
                    AV_PIX_FMT_BGR24,
                    SWS_BICUBIC,
                    NULL, NULL, NULL
                    );

            if (this->img_convert_ctx == NULL) {
                std::cerr << "Allocation of conversion context failed" << std::endl;
                return false;
            }

            av_frame_unref(&(this->rgb_frame));
            this->rgb_frame.format = AV_PIX_FMT_BGR24;
            this->rgb_frame.width = buffer_width;
            this->rgb_frame.height = buffer_height;
            if (0 != av_frame_get_buffer(&(this->rgb_frame), 32)) {
                std::cerr << "Not enough memory to allocate buffer for frame conversion" << std::endl;
                return false;
            }

            this->picture.width = this->video_dec_ctx->width;
            this->picture.height = this->video_dec_ctx->height;
            this->picture.cn = 3;
            this->picture.data = this->rgb_frame.data[0];
            this->picture.step = this->rgb_frame.linesize[0];
        }

        auto start = std::chrono::high_resolution_clock::now();
        sws_scale(
            this->img_convert_ctx,
            this->frame->data,
            this->frame->linesize,
            0, this->video_dec_ctx->coded_height,
            this->rgb_frame.data,
            this->rgb_frame.linesize
            );
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        std::cout << "sws_scale (C++): " << elapsed.count() << " s\n";

        *data = this->picture.data;
        *step = this->picture.step;
        *width = this->picture.width;
        *height = this->picture.height;
        *cn = this->picture.cn;

        return true;
    }


    bool decode_packet(const AVPacket *pkt, uint8_t** data, int* step, int* width, int* height, int* cn, char *frame_type, MVS_DTYPE **motion_vectors, MVS_DTYPE *num_mvs) {
        // send encoded data packet to the decoder
        int ret = avcodec_send_packet(this->video_dec_ctx, pkt);
        if (ret < 0) {
            std::cerr << "Error while sending a packet to the decoder: " << ret << std::endl;
            return false;
        }

        // loop over packets until the next frame is fully assembled
        while (ret >= 0)  {
            // try to get the next frame from decoder
            ret = avcodec_receive_frame(this->video_dec_ctx, this->frame);
            // failed: end of stream or no frame available, stop and return with success
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            // failed: another error occured, return with error
            else if (ret < 0) {
                std::cerr << "Error while receiving a frame from the decoder: " << ret << std::endl;
                return false;
            }
            // sucessfully decoded new frame, get motion vectors
            else {

                AVFrameSideData *sd = av_frame_get_side_data(this->frame, AV_FRAME_DATA_MOTION_VECTORS);
                if (sd) {
                    AVMotionVector *mvs = (AVMotionVector *)sd->data;

                    *num_mvs = sd->size / sizeof(*mvs);

                    if (*num_mvs > 0) {

                        // allocate memory for motion vectors as 1D array
                        if (!(*motion_vectors = (MVS_DTYPE *) malloc(*num_mvs * 10 * sizeof(MVS_DTYPE)))) {                             std::cerr << "Memory allocation for motion vectors failed." << std::endl;
                            return false;
                        }

                        // store the motion vectors in the allocated memory (C contiguous)
                        for (MVS_DTYPE i = 0; i < *num_mvs; ++i) {
                            *(*motion_vectors + i*10     ) = static_cast<MVS_DTYPE>(mvs[i].source);                             *(*motion_vectors + i*10 +  1) = static_cast<MVS_DTYPE>(mvs[i].w);                             *(*motion_vectors + i*10 +  2) = static_cast<MVS_DTYPE>(mvs[i].h);                             *(*motion_vectors + i*10 +  3) = static_cast<MVS_DTYPE>(mvs[i].src_x);                             *(*motion_vectors + i*10 +  4) = static_cast<MVS_DTYPE>(mvs[i].src_y);                             *(*motion_vectors + i*10 +  5) = static_cast<MVS_DTYPE>(mvs[i].dst_x);                             *(*motion_vectors + i*10 +  6) = static_cast<MVS_DTYPE>(mvs[i].dst_y);                             *(*motion_vectors + i*10 +  7) = static_cast<MVS_DTYPE>(mvs[i].motion_x);                             *(*motion_vectors + i*10 +  8) = static_cast<MVS_DTYPE>(mvs[i].motion_y);                             *(*motion_vectors + i*10 +  9) = static_cast<MVS_DTYPE>(mvs[i].motion_scale);                             //*(*motion_vectors + i*11 + 10) = static_cast<MVS_DTYPE>(mvs[i].flags);
                        }
                    }
                }

                // convert AVFrame to numpy ndarray
                if(!this->frame_to_buffer(data, step, width, height, cn)) {
                    std::cerr << "Conversion of frame failed." << std::endl;
                    return false;
                }

                // get frame type (I, P, B, etc.) and create a null terminated c-string                 frame_type[0] = av_get_picture_type_char(this->frame->pict_type);
                frame_type[1] = '\0';
            }
        }

        return true;
    }

};



//##############################################################################
//
//         MAIN
//
//##############################################################################


// Overlay every motion vector on the frame as a thin red arrow pointing from
// its source position to its destination position.
void draw_motion_vectors(cv::Mat frame, std::vector<AVMotionVector> *motion_vectors) {
    for (const AVMotionVector &mv : *motion_vectors) {
        const cv::Point origin(mv.src_x, mv.src_y);
        const cv::Point target(mv.dst_x, mv.dst_y);
        cv::arrowedLine(frame, origin, target, cv::Scalar(0, 0, 255), 1, cv::LINE_AA, 0, 0.1);
    }
}


int main(int argc, char **argv)
{
    // filename of the video file
    const char *url = "vid.mp4";

    VideoCap cap;

    // open the video file
    bool ret = cap.open(url);
    if (!ret) {
        std::cerr << "Could not open the video url" << std::endl;
        return -1;
    }

    // continuously read and display video frames and motion vectors
    while(1) {

        std::cout << "##########################" << std::endl;

        cv::Mat frame;
        MVS_DTYPE *motion_vectors = NULL;
        MVS_DTYPE num_mvs = 0;
        char frame_type[2] = "?";

        auto start = std::chrono::high_resolution_clock::now();

        // read next video frame and corresponding motion vectors
        bool ret = cap.read(frame, frame_type, &motion_vectors, &num_mvs);

        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        std::cout << "Elapsed time: " << elapsed.count() << " s\n";

        std::chrono::milliseconds timespan(25);
        std::this_thread::sleep_for(timespan);

        // if there is an error reading the frame
        if(!ret) {
            std::cerr << "Could not read the next frame" << std::endl;
            return -1;
        }
        else {

            // if the frame is not empty
            cv::Size s = frame.size();
            if (s.height > 0 && s.width > 0) {

                // print type of frame (I, P, B, etc)
                std::cout << "Frame type: " << frame_type << std::endl;

                // print motion vectors
                /*for (std::vector<AVMotionVector>::size_type i = 0; i < motion_vectors.size(); i++) {                     std::cout << std::setw(7) << "src: " << motion_vectors[i].source                             << std::setw(6) << "w: " << static_cast<int16_t>(motion_vectors[i].w)                             << std::setw(6) << "h: " << static_cast<int16_t>(motion_vectors[i].h)                             << std::setw(10) << "src_x: " << motion_vectors[i].src_x                             << std::setw(10) << "src_y: " << motion_vectors[i].src_y                             << std::setw(10) << "dst_x: " << motion_vectors[i].dst_x                             << std::setw(10) << "dst_y: " << motion_vectors[i].dst_y                             << std::setw(10) << "mot_x: " << motion_vectors[i].motion_x                             << std::setw(12) << "mot_y: " << motion_vectors[i].motion_y                             << std::setw(12) << "mot_scl: " << motion_vectors[i].motion_scale                             << std::setw(9) << "flags: " << motion_vectors[i].flags << std::endl;
                }*/

                //draw_motion_vectors(frame, &motion_vectors);

                // show frame
                cv::imshow("Frame", frame);

                if (motion_vectors)
                    free(motion_vectors);
                    motion_vectors = NULL;

                // if user presses "ESC" stop program
                char c=(char)cv::waitKey(1);
                if(c==27) {
                    break;
                }
            }
        }
    }


    // when everything done, release the video capture object
    cap.release();

    // close the GUI window
    cv::destroyAllWindows();

    return 0;
}

_______________________________________________
Libav-user mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/libav-user

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Reply via email to