~ft/dav1d

6b5e8cc587d544ab399488fa032640268856da9c — Sigrid Solveig Haflínudóttir 52 years ago 3b14f94 master
inline and unroll more
3 files changed, 34 insertions(+), 67 deletions(-)

M include/common/intops.h
M src/looprestoration_tmpl.c
M src/mc_tmpl.c
M include/common/intops.h => include/common/intops.h +10 -45
@@ 31,54 31,19 @@
#include <stdint.h>

#include "common/attributes.h"

/* Return the larger of the two signed integers a and b (b when equal). */
static inline int imax(const int a, const int b) {
    if (a > b) {
        return a;
    }
    return b;
}

/* Return the smaller of the two signed integers a and b (b when equal). */
static inline int imin(const int a, const int b) {
    if (a < b) {
        return a;
    }
    return b;
}

/* Return the larger of the two unsigned integers a and b (b when equal). */
static inline unsigned umax(const unsigned a, const unsigned b) {
    if (a > b) {
        return a;
    }
    return b;
}

/* Return the smaller of the two unsigned integers a and b (b when equal). */
static inline unsigned umin(const unsigned a, const unsigned b) {
    if (a < b) {
        return a;
    }
    return b;
}
/*
 * Function-like macro forms of the integer helpers.
 *
 * CAUTION: unlike functions, these expand their arguments textually.
 * imax/imin/umax/umin evaluate the selected argument twice (once in the
 * comparison, once as the result), and inv_recenter evaluates `v` and `r`
 * more than once, so do not pass expressions with side effects to them.
 */

/* Signed max/min: operands are compared as int and the result is cast to int. */
#define imax(a, b) (int)((int)(a) > (int)(b) ? (a) : (b))
#define imin(a, b) (int)((int)(a) < (int)(b) ? (a) : (b))
/* Unsigned max/min: operands are compared as unsigned and the result is cast to unsigned. */
#define umax(a, b) (unsigned)((unsigned)(a) > (unsigned)(b) ? (a) : (b))
#define umin(a, b) (unsigned)((unsigned)(a) < (unsigned)(b) ? (a) : (b))
/* Clamp v to [0, 255] via iclip(); v is expanded exactly once. */
#define iclip_u8(v) iclip((v), 0, 255)
/* Yield -(int)v when s (read as int) is negative, otherwise (int)v. */
#define apply_sign(v, s) ((int)(s) < 0 ? -(int)(v) : (int)(v))
/* Same as apply_sign, but the sign source s is read as int64_t. */
#define apply_sign64(v, s) ((int64_t)(s) < 0 ? -(int)(v) : (int)(v))
/* 31 - clz(v); presumably floor(log2(v)) for a 32-bit v -- confirm clz() semantics (and v == 0 behavior) in common/attributes.h. */
#define ulog2(v) (int)(31 - clz((unsigned)(v)))
/* 63 - clzll(v); presumably floor(log2(v)) for a 64-bit v -- confirm clzll() semantics in common/attributes.h. */
#define u64log2(v) (int)(63 - clzll((uint64_t)(v)))
/*
 * v > 2*r      -> v
 * v even       -> r + (v >> 1)
 * v odd        -> r - ((v + 1) >> 1)
 * All arithmetic is done in unsigned; `v` and `r` are expanded multiple times.
 */
#define inv_recenter(r, v) (unsigned)((unsigned)(v) > ((unsigned)(r)<<1) ? (v) : (((v)&1) == 0) ? (((unsigned)(v)>>1) + (unsigned)(r)) : ((unsigned)(r) - (((unsigned)(v)+1)>>1)))

/*
 * Clamp v to the inclusive range [min, max].
 * The lower bound is checked first, so if min > max and v < min the
 * result is min.
 */
static inline int iclip(const int v, const int min, const int max) {
    if (v < min) {
        return min;
    }
    if (v > max) {
        return max;
    }
    return v;
}

/* Clamp v to the inclusive range [0, 255] by delegating to iclip(). */
static inline int iclip_u8(const int v) {
    return iclip(v, 0, 255);
}

/* Return -v when s is negative; otherwise (s zero or positive) return v. */
static inline int apply_sign(const int v, const int s) {
    if (s < 0) {
        return -v;
    }
    return v;
}

/*
 * Return -v when the 64-bit value s is negative; otherwise return v.
 * Only the sign of s is consulted, never its magnitude.
 */
static inline int apply_sign64(const int v, const int64_t s) {
    if (s < 0) {
        return -v;
    }
    return v;
}

/*
 * Return 31 - clz(v).
 * NOTE(review): clz() comes from common/attributes.h; presumably it counts
 * leading zero bits of a 32-bit value, making this floor(log2(v)) for
 * v != 0 -- confirm, including what happens for v == 0.
 */
static inline int ulog2(const unsigned v) {
    return 31 - clz(v);
}

/*
 * Return 63 - clzll(v).
 * NOTE(review): clzll() comes from common/attributes.h; presumably it counts
 * leading zero bits of a 64-bit value, making this floor(log2(v)) for
 * v != 0 -- confirm, including what happens for v == 0.
 */
static inline int u64log2(const uint64_t v) {
    return 63 - clzll(v);
}

/*
 * Map v back around the reference value r:
 *   - v greater than 2*r is returned unchanged;
 *   - otherwise an even v yields r + v/2 and an odd v yields r - (v+1)/2.
 * All arithmetic is unsigned.
 */
static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
    const unsigned twice_r = r << 1;

    if (v > twice_r) {
        return v;
    }
    return (v & 1) ? r - ((v + 1) >> 1) : r + (v >> 1);
}

#endif /* DAV1D_COMMON_INTOPS_H */

M src/looprestoration_tmpl.c => src/looprestoration_tmpl.c +4 -11
@@ 156,14 156,11 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
    for (int j = 0; j < h + 6; j++) {
        for (int i = 0; i < w; i++) {
            int sum = (1 << (bitdepth + 6));
            int sum = (1 << (bitdepth + 6)) +
#if BITDEPTH == 8
            sum += tmp_ptr[i + 3] * 128;
            tmp_ptr[i + 3] * 128 +
#endif

            for (int k = 0; k < 7; k++) {
                sum += tmp_ptr[i + k] * filter[0][k];
            }
			(((tmp_ptr[i+0]*filter[0][0] + tmp_ptr[i+1]*filter[0][1]) + (tmp_ptr[i+2]*filter[0][2] + tmp_ptr[i+3]*filter[0][3])) + (tmp_ptr[i+4]*filter[0][4] + tmp_ptr[i+5]*filter[0][5])) + tmp_ptr[i+6]*filter[0][6];

            hor_ptr[i] =
                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);


@@ 177,11 174,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            int sum = -round_offset;

            for (int k = 0; k < 7; k++) {
                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
            }
            int sum = -round_offset + hor[(j+0)*REST_UNIT_STRIDE+i]*filter[1][0] + hor[(j+1)*REST_UNIT_STRIDE+i]*filter[1][1] + hor[(j+2)*REST_UNIT_STRIDE+i]*filter[1][2] + hor[(j+3)*REST_UNIT_STRIDE+i]*filter[1][3] + hor[(j+4)*REST_UNIT_STRIDE+i]*filter[1][4] + hor[(j+5)*REST_UNIT_STRIDE+i]*filter[1][5] + hor[(j+6)*REST_UNIT_STRIDE+i]*filter[1][6];

            p[j * PXSTRIDE(p_stride) + i] =
                iclip_pixel((sum + rounding_off_v) >> round_bits_v);

M src/mc_tmpl.c => src/mc_tmpl.c +20 -11
@@ 809,25 809,34 @@ static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,

    src -= 3 * PXSTRIDE(src_stride);
    for (int y = 0; y < 15; y++, mx += abcd[1]) {
        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
        int tmx = mx;

        mid_ptr[0] = FILTER_WARP_RND(src, 0, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[1] = FILTER_WARP_RND(src, 1, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[2] = FILTER_WARP_RND(src, 2, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[3] = FILTER_WARP_RND(src, 3, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[4] = FILTER_WARP_RND(src, 4, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[5] = FILTER_WARP_RND(src, 5, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[6] = FILTER_WARP_RND(src, 6, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
        mid_ptr[7] = FILTER_WARP_RND(src, 7, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
                                         7 - intermediate_bits);
        }
        src += PXSTRIDE(src_stride);
        mid_ptr += 8;
    }

    mid_ptr = &mid[3 * 8];
    for (int y = 0; y < 8; y++, my += abcd[3]) {
        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
        int tmy = my;

        tmp[0] = FILTER_WARP_RND(mid_ptr, 0, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[1] = FILTER_WARP_RND(mid_ptr, 1, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[2] = FILTER_WARP_RND(mid_ptr, 2, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[3] = FILTER_WARP_RND(mid_ptr, 3, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[4] = FILTER_WARP_RND(mid_ptr, 4, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[5] = FILTER_WARP_RND(mid_ptr, 5, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[6] = FILTER_WARP_RND(mid_ptr, 6, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
        tmp[7] = FILTER_WARP_RND(mid_ptr, 7, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
        }
        mid_ptr += 8;
        tmp += tmp_stride;
    }