Avoiding thread-local storage overhead (making ffmpeg YADIF scalable)
I'm trying to create a small ffmpeg "hack" that enables parallel execution of the yadif filter.
I think I have found a solution, but there can only be one concurrent instance of it. This is because the "scalable_yadif_context" is local to the function "scalable_yadif_filter_line1", which replaces the original yadif "filter_line" function. I could make "scalable_yadif_context" thread-local, but since this function is called very often, that would incur quite a high overhead.
Any ideas on how to solve this?
// We need the context definition in order to access the original filter_line
// function. Just redefine it here and hope that it is not changed inside of libavfilter.
typedef struct {
    int mode;
    int parity;
    int frame_pending;
    int auto_enable;
    AVFilterBufferRef *cur;
    AVFilterBufferRef *next;
    AVFilterBufferRef *prev;
    AVFilterBufferRef *out;
    void (*filter_line)(uint8_t *dst,
                        uint8_t *prev, uint8_t *cur, uint8_t *next,
                        int w, int prefs, int mrefs, int parity, int mode);
    const AVPixFmtDescriptor *csp;
} YADIFContext;

struct scalable_yadif_context
{
    std::vector<std::function<void()>> calls;
    int end_prefs;

    scalable_yadif_context() : end_prefs(std::numeric_limits<int>::max()) {}
};

void (*org_yadif_filter_line)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) = nullptr;

void scalable_yadif_filter_line(scalable_yadif_context& ctx, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode)
{
    if (ctx.end_prefs == std::numeric_limits<int>::max())
        ctx.end_prefs = -prefs; // The last call to filter_line for a frame has a negative prefs.

    // Defer the call; the captured pointers stay valid until the frame is done.
    ctx.calls.push_back([=]
    {
        org_yadif_filter_line(dst, prev, cur, next, w, prefs, mrefs, parity, mode);
    });

    if (prefs == ctx.end_prefs)
    {
        // Capture ctx by reference here; [=] would copy the whole context
        // (including its call vector) into every worker task.
        tbb::parallel_for(tbb::blocked_range<size_t>(0, ctx.calls.size()), [&](const tbb::blocked_range<size_t>& r)
        {
            for (auto n = r.begin(); n != r.end(); ++n)
                ctx.calls[n]();
        });
        ctx.calls.clear();
        ctx.end_prefs = std::numeric_limits<int>::max();
    }
}

void scalable_yadif_filter_line1(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode)
{
    // Local to the current function; making this thread-local would incur heavy overhead.
    static scalable_yadif_context ctx;
    scalable_yadif_filter_line(ctx, dst, prev, cur, next, w, prefs, mrefs, parity, mode);
}

void make_scalable_yadif(AVFilterContext* ctx)
{
    YADIFContext* yadif = (YADIFContext*)ctx->priv;
    // The data race should not be a problem since we always write the same value.
    org_yadif_filter_line = yadif->filter_line;
    // Hmm, this will only work for one concurrent instance...
    // I need a unique "scalable_yadif_filter_line1" for each call...
    yadif->filter_line = scalable_yadif_filter_line1;
}
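The core pattern above is independent of ffmpeg: record each per-line call as a closure, and when the last line of the frame arrives, run the whole batch in parallel and reset. A minimal sketch of that pattern using only the standard library (names like `deferred_batch` are illustrative, not from the original code; plain `std::thread` stands in for `tbb::parallel_for`):

```cpp
#include <algorithm>
#include <cassert>
#include <functional>
#include <numeric>
#include <thread>
#include <vector>

// Records calls instead of executing them, then runs the whole batch in
// parallel, mirroring how scalable_yadif_filter_line defers filter_line work.
struct deferred_batch {
    std::vector<std::function<void()>> calls;

    void record(std::function<void()> f) { calls.push_back(std::move(f)); }

    // Split the recorded calls into contiguous chunks, one thread per chunk.
    // Safe as long as each recorded call touches distinct data (as yadif's
    // per-line calls do, each writing its own output line).
    void run_parallel(unsigned nthreads)
    {
        std::vector<std::thread> pool;
        const size_t chunk = (calls.size() + nthreads - 1) / nthreads;
        for (unsigned t = 0; t < nthreads; ++t) {
            const size_t begin = t * chunk;
            const size_t end   = std::min(calls.size(), begin + chunk);
            if (begin >= end)
                break;
            pool.emplace_back([this, begin, end] {
                for (size_t n = begin; n != end; ++n)
                    calls[n]();
            });
        }
        for (auto& th : pool)
            th.join();
        calls.clear();
    }
};
```

Each recorded call writes a distinct slot, so no synchronization is needed inside the batch, which is exactly why the deferred yadif lines can run concurrently.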
I've created an extremely ugly solution that works for up to 18 concurrent instances.
#define RENAME(a) f ## a
#define ff(x) \
void RENAME(x)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) \
{\
    static scalable_yadif_context ctx;\
    scalable_yadif_filter_line(ctx, dst, prev, cur, next, w, prefs, mrefs, parity, mode);\
}

ff(0); ff(1); ff(2); ff(3); ff(4); ff(5); ff(6); ff(7); ff(8); ff(9); ff(10); ff(11); ff(12); ff(13); ff(14); ff(15); ff(16); ff(17);

void (*fs[])(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) =
    {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17};

namespace caspar {

// Pool of free tags, one per pre-generated filter function.
tbb::concurrent_queue<int> tags;

void init()
{
    for (int n = 0; n < 18; ++n)
        tags.push(n);
}

int make_scalable_yadif(AVFilterContext* ctx)
{
    static boost::once_flag flag = BOOST_ONCE_INIT;
    boost::call_once(flag, init);

    YADIFContext* yadif = (YADIFContext*)ctx->priv;
    org_yadif_filter_line = yadif->filter_line;

    int tag;
    if (!tags.try_pop(tag))
    {
        LOG(warning) << "Not enough scalable-yadif instances. Running non-scalable.";
        return -1;
    }

    yadif->filter_line = fs[tag];
    return tag;
}

void release_scalable_yadif(int tag)
{
    if (tag != -1)
        tags.push(tag);
}

} // namespace caspar
Why not just pass a per-thread buffer to the scalable_yadif_filter_line1 function? It might require a bit of reorganization, but it's far better than using statics or thread-locals (after all, what happens to your thread-local buffer if the thread goes on to do something else?)
If you cannot pass a buffer to the function (due to the fixed ffmpeg API), TLS is probably your only choice. The overhead isn't as bad as you might think, but it's still not great. I'd highly recommend looking into modifying ffmpeg to add a context parameter.
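If the signature really cannot change, the TLS fallback the comment mentions is straightforward with C++11 `thread_local`: each decoding thread lazily gets its own context, with no tag pool at all. A minimal sketch (the `line_batch` struct is an illustrative stand-in for `scalable_yadif_context`):

```cpp
#include <cassert>
#include <thread>

// Stand-in for scalable_yadif_context; here it just counts lines.
struct line_batch {
    int lines_seen = 0;
};

// Each thread that calls this gets its own ctx, constructed on first use.
// This is the per-thread state the commenter refers to; the cost is one
// TLS lookup per call, paid on every filter_line invocation.
int filter_line_tls()
{
    thread_local line_batch ctx;
    return ++ctx.lines_seen;
}
```

The caveat from the comment applies: the thread-local context outlives the filter instance and lingers for as long as the thread does, which is why passing an explicit context parameter is the cleaner design.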