Improving SSE (SSSE3) YUV to RGB code



I am looking to optimise some SSE code I wrote for converting YUV to RGB (both planar and packed YUV functions).

I am using SSSE3 at the moment, but if there are useful functions from later SSE versions that's ok.

I am mainly interested in how I would work out processor stalls and the like.

Anyone know of any tools that do static analysis of SSE code?

;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit integer and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch

%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movdqa xmm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm0,xmm7 ; y = y - 16
; subtract 128 from u and v
movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm1,xmm7 ; u = u - 128
psubsw xmm2,xmm7 ; v = v - 128
; load r,b with y
movdqa xmm3,xmm0 ; r = y
pshufd xmm5,xmm0, 0xE4 ; b = y

; r = y + v + v >> 2 + v >> 3 + v >> 5
paddsw xmm3, xmm2 ; add v to r
movdqa xmm7, xmm1 ; move u to scratch
pshufd xmm6, xmm2, 0xE4 ; move v to scratch

psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,1 ; divide v by 2
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw xmm5, xmm1 ; add u to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,4 ; divide u by 16 (u >> 6 in total)
paddsw xmm5, xmm7 ; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movdqa xmm7,xmm2 ; move v to scratch
pshufd xmm6,xmm1, 0xE4 ; move u to scratch
movdqa xmm4,xmm0 ; g = y

psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide u by 2
psubsw xmm4,xmm6 ; subtract from g

psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,2 ; divide v by 4
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro

SECTION .data align=16

Const16 dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16

Const128 dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128

UMask db 0x01
db 0x80
db 0x01
db 0x80
db 0x05
db 0x80
db 0x05
db 0x80
db 0x09
db 0x80
db 0x09
db 0x80
db 0x0d
db 0x80
db 0x0d
db 0x80

VMask db 0x03
db 0x80
db 0x03
db 0x80
db 0x07
db 0x80
db 0x07
db 0x80
db 0x0b
db 0x80
db 0x0b
db 0x80
db 0x0f
db 0x80
db 0x0f
db 0x80

YMask db 0x00
db 0x80
db 0x02
db 0x80
db 0x04
db 0x80
db 0x06
db 0x80
db 0x08
db 0x80
db 0x0a
db 0x80
db 0x0c
db 0x80
db 0x0e
db 0x80

; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSSE3
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx

mov esi, [fromPtr]
mov edi, [toPtr]
mov ecx, [width]
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP
REPEATLOOP: ; loop over width / 8
; YUV422 packed inputer
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm2, xmm0 ; copy to xmm2
; extract both y giving y0y0
pshufb xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
pshufb xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
pshufb xmm2, [VMask]

yuv2rgbsse2

rgba32sse2output

; endloop
add edi,32
add esi,16
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP
ENDLOOP:
; Cleanup
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret

cglobal Convert_YUV420P_RGBA32_SSSE3
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx

mov esi, [fromYPtr]
mov eax, [fromUPtr]
mov ebx, [fromVPtr]
mov edi, [toPtr1]
mov ecx, [width1]
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP1
REPEATLOOP1: ; loop over width / 8
; YUV420 Planar inputer
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm1, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm2, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000

; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000
punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm1 v0v0v0v000000000
punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000
pshuflw xmm2,xmm2, 0xA0 ; copy v values
pshufhw xmm2,xmm2, 0xA0 ; to get v0v0

yuv2rgbsse2

rgba32sse2output

; endloop
add edi,32
add esi,8
add eax,4
add ebx,4
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits

Comments (4)

雪落纷纷 2024-10-16 14:11:02


If you keep u & v interleaved in one register, and use 'pmaddwd' and precomputed constants instead of your shift-and-add approach, you can compress the conversion code to about a third, and get rid of most stalls at the same time:

; xmm0 = y y y y y y y y
; xmm3 = u v u v u v u v

psubsw xmm3, [Const128]
psubsw xmm0, [Const16] 
movdqa xmm4, xmm3
movdqa xmm5, xmm3
pmaddwd xmm3, [const_1]
pmaddwd xmm4, [const_2]
pmaddwd xmm5, [const_3]
psrad xmm3, 14
psrad xmm4, 14
psrad xmm5, 14
pshufb xmm3, [const_4] ; or pshuflw & pshufhw
pshufb xmm4, [const_4]
pshufb xmm5, [const_4]
paddsw xmm3, xmm0
paddsw xmm4, xmm0
paddsw xmm5, xmm0
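
For reference, here is one possible layout for the constants that snippet references; they are not given in the answer, so treat the values as an assumption. The multipliers simply reuse the question's own shift-and-add coefficients (1.40625 for R, 0.34375 and 0.71875 for G, 1.765625 for B) scaled by 2^14 to match the psrad by 14, and const_4 is a pshufb mask that broadcasts each 32-bit chroma result to the two pixels sharing that U/V pair:

SECTION .data align=16
; assumed values, not part of the original answer
const_1 dw 0, 23040, 0, 23040, 0, 23040, 0, 23040 ; R: 0*u + 1.40625*v, scaled by 2^14
const_2 dw -5632, -11776, -5632, -11776, -5632, -11776, -5632, -11776 ; G: -0.34375*u - 0.71875*v
const_3 dw 28928, 0, 28928, 0, 28928, 0, 28928, 0 ; B: 1.765625*u + 0*v
const_4 db 0,1,0,1, 4,5,4,5, 8,9,8,9, 12,13,12,13 ; copy each dword's low word to both pixels of the pair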

If you want it to work even faster, playing with PMADDUBSW should allow you to work on 16 pixels at a time with a small increase in complexity.

Most processors (particularly non-Intels, notorious for not having a well-working hardware prefetcher, but, to a lesser extent, Intels too) will benefit from a prefetchnta [esi+256] thrown inside the loop.
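
As a sketch of where that prefetch could go in the question's YUV422 loop (the +256 distance, roughly 16 iterations ahead at 16 bytes per iteration, is just a starting point to tune):

REPEATLOOP:
movdqa xmm0, [esi] ; existing load of 8 packed YUYV pixels
prefetchnta [esi+256] ; request data ~16 iterations ahead with a non-temporal hint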

EDIT: the code that uses PMADDUBSW could look like this (correctness not guaranteed):

const_a:
times 4 db 1,3
times 4 db 5,7
const_b:
times 4 db 9,11
times 4 db 13,15
const_c: times 8 dw 0x00ff
const_d: times 4 dd 0x00ffffff

const_uv_to_rgb_mul:
...
const_uv_to_rgb_add:
...

movdqa xmm4, [esi]
movdqa xmm0, xmm4
movdqa xmm1, xmm4
pshufb xmm0, [const_a] 
pshufb xmm1, [const_b]
pand xmm4, [const_c] 

; xmm0: uv0 uv0 uv0 uv0 uv2 uv2 uv2 uv2
; xmm1: uv4 uv4 uv4 uv4 ...
; xmm4: y0 0 y1 0 y2 0 y3 0 y4 0 y5 0 y6 0 y7 0

pmaddubsw xmm0, [const_uv_to_rgb_mul]
pmaddubsw xmm1, [const_uv_to_rgb_mul]
paddsw xmm0, [const_uv_to_rgb_add]
paddsw xmm1, [const_uv_to_rgb_add]
psraw xmm0, 6
psraw xmm1, 6

; r01 g01 b01 0 r23 g23 b23 0

pshufd xmm2, xmm0, 2+3*4+2*16+3*64
pshufd xmm0, xmm0, 0+1*4+0*16+1*64
pshufd xmm3, xmm1, 2+3*4+2*16+3*64
pshufd xmm1, xmm1, 0+1*4+0*16+1*64

; xmm0: r01 g01 b01 0 r01 g01 b01 0
; xmm2: r23 g23 b23 0 r23 g23 b23 0
; xmm1: r45 g45 b45 0 r45 g45 b45 0

paddsw xmm0, xmm4 ; add y
paddsw xmm1, xmm4 
paddsw xmm2, xmm4
paddsw xmm3, xmm4

packuswb xmm0, xmm2  ; pack with saturation into 0-255 range
packuswb xmm1, xmm3
pand xmm0, [const_d] ; zero out the alpha byte
pand xmm1, [const_d]
movntdq [edi], xmm0
movntdq [edi+16], xmm1
红ご颜醉 2024-10-16 14:11:02


The lookup tables work if you use saturating adds, but they limit you to 1 pixel at a time, and memory lookups are slow when they miss the cache.
The 3-pmaddubsw approach works OK, but the instruction is slow on Core 2 and not available on older CPUs, so 4 pmuls might work better.

在梵高的星空下 2024-10-16 14:11:02


Given that the source data is only 8-bit per component, have you tried something simple, like:

uint32_t YtoRGBlookupTable[256] = {
    /* precomputed table (cut & pasted from a spreadsheet or something) */
};

uint32_t UtoRGBlookupTable[256] = {
    /* precomputed table (cut & pasted from a spreadsheet or something) */
};

uint32_t VtoRGBlookupTable[256] = {
    /* precomputed table (cut & pasted from a spreadsheet or something) */
};

while(i < something) {
    UVtemp = UtoRGBlookupTable[src->u0] + VtoRGBlookupTable[src->v0];
    dest[i] = YtoRGBlookupTable[src->y0] + UVtemp;
    dest[i+1] = YtoRGBlookupTable[src->y1] + UVtemp;
    UVtemp = UtoRGBlookupTable[src->u1] + VtoRGBlookupTable[src->v1];
    dest[i+2] = YtoRGBlookupTable[src->y2] + UVtemp;
    dest[i+3] = YtoRGBlookupTable[src->y3] + UVtemp;
    i += 4;
    src++;
}

D'oh - sorry. This won't work because you can't prevent the green from overflowing into the red, and you'd need to handle green separately.

独闯女儿国 2024-10-16 14:11:02


A little update with what you can achieve using an AVX2 / lookup table approach, including a pure C++ implementation which is simple enough for most of the modern compilers to successfully apply at least full SSE4 vectorization.

The AVX2 conversion is fast enough to convert a 4k YUV420 frame to RGB in roughly 3 ms, while the auto-vectorized one still gets down to about 5 ms.
Another speedup is still obtainable by explicitly unrolling an outer loop, as the loads can be pulled further ahead.

Compared to the corresponding routine in Intel's IPP, this gives more than a 2x speedup.

#include <array>
#include <algorithm>
#include <cstdint>

#ifdef __AVX2__
    #include <immintrin.h>
    #include <emmintrin.h>
#endif

struct matrix_ycbcr_full
{
    static constexpr float matrix[3][4] = {
        {1.0f, 0.0f, 1.4f, -179.2f},
        {1.0f, -0.343f, -0.711f, 134.912f},
        {1.0f, 1.765f, 0.0f, -225.92f}
    };
};

template<typename T>
struct yuv_lut_converter
{
    static constexpr size_t bias = 2;

    using offset_t = std::array<int16_t, 4>;
    using lookup_t = std::array<std::array<offset_t, 256>, 3>;

    static constexpr lookup_t generate_lut() noexcept
    {
        constexpr T matrix = {};
        lookup_t tmp = {};

        for (size_t i = 0; i < 256; ++i)
        {
            // Apply conversion matrix for Y, U and V input channels.
            for (size_t input_channel = 0; input_channel < 3; ++input_channel)
            {
                // For each of R, G and B output channels.
                for (size_t output_channel = 0; output_channel < 3; ++output_channel)
                {
                }
            }
            // For each of R, G and B output channels.
            for (size_t output_channel = 0; output_channel < 3; ++output_channel)
            {
                // Luminance is treated as unsigned integer.
                // Apply offset from 4th input (const 1) as offset to Y channel input.
                tmp[0][i][output_channel] =
                    static_cast<int16_t>(matrix.matrix[output_channel][0] * static_cast<float>(i << bias) +
                                         matrix.matrix[output_channel][3] * static_cast<float>(1 << bias));

                // Chroma is treated as unsigned integer for YUV and YCbCr.
                tmp[1][i][output_channel] =
                    static_cast<int16_t>(matrix.matrix[output_channel][1] * static_cast<float>(i << bias));
                tmp[2][i][output_channel] =
                    static_cast<int16_t>(matrix.matrix[output_channel][2] * static_cast<float>(i << bias));
            }
        }
        return tmp;
    }

    alignas(8) const lookup_t table = generate_lut();
    
    // Current day compilers will be able to utilize SSE4 for this method.
    inline void lookup(
        const uint8_t& __restrict y_0,
        const uint8_t& __restrict y_1,
        const uint8_t& __restrict u_0,
        const uint8_t& __restrict v_0,
        uint8_t* const __restrict rgb) const noexcept
    {

        // Process as unsigned integer - the rollover in the chroma channel is already handled.
        const auto& __restrict lut_y_0 = table[0][y_0];
        const auto& __restrict lut_y_1 = table[0][y_1];
        const auto& __restrict lut_u_0 = table[1][u_0];
        const auto& __restrict lut_v_0 = table[2][v_0];

        offset_t tmp0, tmp1;
        for (size_t output_channel = 0; output_channel < 4; ++output_channel)
        {
            const auto& tmpUV = static_cast<int16_t>(lut_u_0[output_channel] + lut_v_0[output_channel]);
            tmp0[output_channel] = std::max<int16_t>(std::min<int16_t>(static_cast<int16_t>(tmpUV + lut_y_0[output_channel]) >> bias, 255), 0);
            tmp1[output_channel] = std::max<int16_t>(std::min<int16_t>(static_cast<int16_t>(tmpUV + lut_y_1[output_channel]) >> bias, 255), 0);
        }

        for (size_t output_channel = 0; output_channel < 3; ++output_channel)
        {
            *(rgb + 0 + output_channel) = static_cast<uint8_t>(tmp0[output_channel]);
            *(rgb + 3 + output_channel) = static_cast<uint8_t>(tmp1[output_channel]);
        }
    }

#ifdef __AVX2__
    // Explicit AVX2 implementation processing twice as much data in the same number of instructions.
    inline void lookup_avx2(
        const uint8_t& __restrict y_0,
        const uint8_t& __restrict y_1,
        const uint8_t& __restrict y_2,
        const uint8_t& __restrict y_3,
        const uint8_t& __restrict u_0,
        const uint8_t& __restrict v_0,
        const uint8_t& __restrict u_1,
        const uint8_t& __restrict v_1,
        uint8_t* const __restrict rgb) const noexcept
    {
        // Avoid _mm256_i32gather_epi64.
        // Hits a bug on Intel CPUs where it never hits 1st level TLB, only 2nd level TLB.
#if 0
        __m256i input_y = _mm256_i32gather_epi64(
            reinterpret_cast<const long long * __restrict>(&table),
            _mm_setr_epi32(y_0, y_1, y_2, y_3), 8);
        __m256i input_u_v = _mm256_i32gather_epi64(
            reinterpret_cast<const long long * __restrict>(&table),
            _mm_add_epi32(_mm_setr_epi32(u_0, v_0, u_1, v_1), _mm_setr_epi32(256, 512, 256, 512)), 8);
#else
        // Loads are being done by scalar ports, scales just as good as the gather instruction should have.
        __m256i input_y = _mm256_setr_epi64x(
            *reinterpret_cast<const int64_t* __restrict>(table[0][y_0].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[0][y_1].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[0][y_2].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[0][y_3].data()));
        const __m256i input_u_v = _mm256_setr_epi64x(
            *reinterpret_cast<const int64_t* __restrict>(table[1][u_0].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[2][v_0].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[1][u_1].data()),
            *reinterpret_cast<const int64_t* __restrict>(table[2][v_1].data()));
#endif
        // Swap U and V.
        const __m256i input_v_u = _mm256_shuffle_epi32(input_u_v, 0b01'00'11'10);
        // Sum U and V into alternating Y channels.
        input_y = _mm256_add_epi16(input_y, input_u_v);
        // Sum up the remaining bits.
        input_y = _mm256_add_epi16(input_y, input_v_u);
        // Strip the bias.
        input_y = _mm256_srli_epi16(input_y, bias);

        const auto min = _mm256_setzero_si256();
        const auto max = _mm256_set1_epi16(0xFF);
        // Reduce to valid range.
        input_y = _mm256_min_epi16(_mm256_max_epi16(input_y, min), max);

        // input_y now contains 8bit RxGxBxxx - pack down to 8bit RGB.
        // There is a total of 12 good bytes in there, get rid of padding first.
        const auto shuffle = _mm256_setr_epi8(
            /* 6 valid */ 0, 2, 4, 8, 10, 12,
            /* 10 empty */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            /* 6 empty */ -1, -1, -1, -1, -1, -1,
            /* 6 valid */ 0, 2, 4, 8, 10, 12,
            /* 4 empty */ -1, -1, -1, -1);
        input_y = _mm256_shuffle_epi8(input_y, shuffle);

        // Align all good 12 bytes in one 128bit range.
        __m128i result = _mm_or_si128(_mm256_extracti128_si256(input_y, 0), _mm256_extracti128_si256(input_y, 1));
        // Non-temporal streaming stores.
        _mm_stream_si64(reinterpret_cast<long long*>(rgb), _mm_cvtsi128_si64(result));
        _mm_stream_si32(reinterpret_cast<int*>(rgb + 8), _mm_extract_epi32(result, 2));
    }
#endif
};

int main()
{
    static constexpr yuv_lut_converter<matrix_ycbcr_full> converter;
    uint8_t rgb_out[12] = {};
    uint8_t rgb_out_avx[12] = {};

    const uint8_t line[8] = {0x81, 0x52, 0x85, 0x4F, 0x80, 0x53, 0x85, 0x53};

    converter.lookup(line[1], line[3], line[0], line[2], rgb_out);
    converter.lookup(line[5], line[7], line[4], line[6], rgb_out + 6);

    converter.lookup_avx2(line[1], line[3], line[5], line[7], line[0], line[2], line[4], line[6], rgb_out_avx);

    _mm_sfence();

    for(size_t i = 0; i < 12; ++i)
    {
        if(rgb_out[i] != rgb_out_avx[i])
        {
            return 1;
        }
    }
    return 0;
}

Example on Godbolt

The key part is the bias in the 16-bit LUT, which provides two extra bits of precision and thereby avoids the rounding errors you would get with a simple 8-bit LUT. The LUT is still small enough to stay resident within the L1 cache.

The use of non-temporal stores (and optionally also loads - not shown in this snippet) is essential for keeping the L1 cache clean of the actual image data, and thereby avoiding eviction of the LUT.

This is also trivial to extend to 10-bit inputs and outputs, with the same precision guarantees. Going beyond that will exceed the available L1 cache size, though, so performance eventually degrades.

Pre-compiling the LUT is just done "because why not", but building the LUT at runtime (and e.g. tweaking the color twist as desired) is equally viable.
