帮助我改进更多 SSE2 代码
我想请大家帮忙改进这段双线性缩放 SSE2 代码在 Core 2 CPU 上的性能。
在我的 Atom N270 和 i7 上,这段代码比 MMX 版本快大约 2 倍;但在 Core 2 CPU 上,它的速度只与 MMX 版本相当。
代码如下
// Scales `from` into `to` with bilinear filtering and records the lowest
// cycle count seen so far in timeTaken.
//
// Source coordinates are tracked in 7-bit fixed point (1.0 == 128). For every
// destination column/row the integer part selects the top-left source pixel
// and the fractional part becomes the blend weight. The per-row work is done
// by the SSE2 routine _ScaleSSE2X2, which emits destination pixels in pairs.
//
// Does nothing when either bitmap is NULL, when the source is smaller than
// 2x2 (each output pixel blends a 2x2 neighbourhood), when the destination is
// empty, or when the scratch tables cannot be allocated.
//
// NOTE(review): pixels are addressed as 4 bytes each (xr << 2); this assumes
// both bitmaps use a 32-bit colour space - confirm against the callers.
void ConversionProcess::convert_SSE2(BBitmap *from, BBitmap *to)
{
	ULLint start = rdtsc();

	if (!from || !to)
		return;

	uint32 width = from->Bounds().IntegerWidth() + 1;
	uint32 height = from->Bounds().IntegerHeight() + 1;
	uint32 toWidth = to->Bounds().IntegerWidth() + 1;
	uint32 toHeight = to->Bounds().IntegerHeight() + 1;

	// A source narrower/shorter than 2 pixels would make the scaler read
	// past the bitmap; an empty destination would divide by zero below.
	if (width < 2 || height < 2 || toWidth == 0 || toHeight == 0)
		return;

	uint32 fromBPR = from->BytesPerRow();
	uint32 toBPR = to->BytesPerRow();

	// Source step per destination pixel, 7-bit fixed point.
	uint32 x_ratio = ((width - 1) << 7) / toWidth;
	uint32 y_ratio = ((height - 1) << 7) / toHeight;

	uint8 *toPtr = (uint8 *)to->Bits();
	uint8 *fromPtr1 = (uint8 *)from->Bits();	// upper source row
	uint8 *fromPtr2 = fromPtr1 + fromBPR;		// row directly below it

	struct FilterInfo {
		uint16 one_minus_diff;		// weight of the left/upper pixel
		uint16 diff;				// weight of the right/lower pixel
		uint16 one_minus_diff_rep;	// one_minus_diff repeated for the second source row
		uint16 diff_rep;			// diff repeated for the second source row
	};

	FilterInfo *xWeights = (FilterInfo *)memalign(16, toWidth * sizeof(FilterInfo));
	FilterInfo *yWeights = (FilterInfo *)memalign(16, toHeight * sizeof(FilterInfo));
	// The assembly loop pre-loads the index for the next pair before it
	// tests its counter, so it reads up to 2 entries past the last one.
	uint32 *xIndexes = (uint32 *)memalign(16, (toWidth + 2) * sizeof(uint32));
	uint32 *yIndexes = (uint32 *)memalign(16, toHeight * sizeof(uint32));

	if (!xWeights || !yWeights || !xIndexes || !yIndexes) {
		free(xWeights);
		free(yWeights);
		free(xIndexes);
		free(yIndexes);
		return;
	}

	// Horizontal table: byte offset of the left source pixel plus weights.
	// The two weights add up to 128 so that the product of a horizontal and
	// a vertical weight pair adds up to 1 << 14, which is exactly what the
	// scaler divides by.
	uint32 x = 0;
	for (uint32 j = 0; j < toWidth; j++) {
		uint32 xr = x >> 7;
		uint16 diff = x & 127;
		xWeights[j].diff = diff;
		xWeights[j].one_minus_diff = 128 - diff;
		xWeights[j].one_minus_diff_rep = xWeights[j].one_minus_diff;
		xWeights[j].diff_rep = xWeights[j].diff;
		xIndexes[j] = xr << 2;
		x += x_ratio;
	}
	// Give the over-read slots a defined value.
	xIndexes[toWidth] = 0;
	xIndexes[toWidth + 1] = 0;

	// Vertical table: byte offset of the upper source row plus weights.
	// Only the first two fields are filled in; the scaler loads just those.
	uint32 y = 0;
	for (uint32 j = 0; j < toHeight; j++) {
		uint32 yr = y >> 7;
		uint16 diff = y & 127;
		yWeights[j].diff = diff;
		yWeights[j].one_minus_diff = 128 - diff;
		yIndexes[j] = yr * fromBPR;
		y += y_ratio;
	}

	for (uint32 i = 0; i < toHeight; i++) {
		uint8 *upperRow = fromPtr1 + yIndexes[i];
		uint8 *lowerRow = fromPtr2 + yIndexes[i];

		// The SSE2 routine handles toWidth / 2 pixel pairs and must not be
		// entered with a pair count of zero.
		if (toWidth >= 2) {
			_ScaleSSE2X2(toPtr, upperRow, lowerRow, xIndexes, xWeights,
				&yWeights[i], toWidth);
		}

		// An odd width leaves one trailing column the pair-wise routine does
		// not produce; compute it here with the same arithmetic.
		if (toWidth & 1) {
			const uint32 j = toWidth - 1;
			const uint32 wx0 = xWeights[j].one_minus_diff;
			const uint32 wx1 = xWeights[j].diff;
			const uint32 wy0 = yWeights[i].one_minus_diff;
			const uint32 wy1 = yWeights[i].diff;
			const uint8 *top = upperRow + xIndexes[j];
			const uint8 *bottom = lowerRow + xIndexes[j];
			uint8 *out = toPtr + (j << 2);

			for (uint32 c = 0; c < 3; c++) {
				uint32 upper = top[c] * wx0 * wy0 + top[c + 4] * wx1 * wy0;
				uint32 lower = bottom[c] * wx0 * wy1 + bottom[c + 4] * wx1 * wy1;
				out[c] = (uint8)((upper >> 14) + (lower >> 14));
			}
			out[3] = 0;	// the SSE2 routine also writes 0 into the fourth byte
		}

		toPtr += toBPR;
	}

	free(xWeights);
	free(yWeights);
	free(xIndexes);
	free(yIndexes);

	// Keep the best (smallest) cycle count observed.
	ULLint stop = rdtsc() - start;
	if (stop < timeTaken) {
		timeTaken = stop;
	}
}
;
; Copyright (C) 2011 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised bilinear scaler
; cglobal <name>
; Starts an exported routine: declares the symbol _<name> as global, makes
; the bare <name> an alias for _<name> for the rest of the file, aligns the
; current position to 16 bytes and emits the entry label.
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
SECTION .data align=16

; 16-byte mask holding 0x000000FF in each of its four 32-bit lanes.
; ANDing a register with it keeps only the lowest byte of every lane.
RGB_AND:
        times 4 db 0xff, 0x00, 0x00, 0x00
; Stack-frame addresses of the arguments of
; void _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2, void* xIndexPtr, void *xWeightPtr, void *yWeightPtr, uint32 length);
; Each name stands for an ebp-relative address and is only meaningful while
; ebp holds the frame pointer set up at routine entry (first argument at ebp+8).
toPtr       equ ebp+8
fromPtr1    equ ebp+12
fromPtr2    equ ebp+16
xIndexPtr   equ ebp+20
xWeightPtr  equ ebp+24
yWeightPtr  equ ebp+28
length      equ ebp+32
SECTION .text align=16

;-----------------------------------------------------------------------
; void _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2,
;                   void *xIndexPtr, void *xWeightPtr, void *yWeightPtr,
;                   uint32 length);
;
; Writes one row of bilinearly filtered 32-bit pixels, two destination
; pixels per loop iteration (length / 2 iterations; an odd trailing pixel
; is not produced, and length < 2 writes nothing).
;
; ABI:   32-bit x86, all arguments on the stack, caller removes them.
; In:    toPtr      destination row
;        fromPtr1   upper source row, fromPtr2 the source row below it
;        xIndexPtr  uint32 byte offset of the left source pixel per
;                   destination pixel; one entry beyond the last pair is
;                   loaded (value unused)
;        xWeightPtr 4 x uint16 per destination pixel
;                   {1-xDiff, xDiff, 1-xDiff, xDiff}; must be 16-byte
;                   aligned (loaded with movdqa)
;        yWeightPtr 2 x uint16 {1-yDiff, yDiff} for this row
;        length     number of destination pixels
; Out:   nothing (eax is not a meaningful return value)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
; Saved: ebx, esi, edi, ebp
;
; Per channel the result is
;   ((a*F1 + b*F2) >> 14) + ((c*F3 + d*F4) >> 14)
; with a,b the upper and c,d the lower source pixels and F1..F4 the
; products of the horizontal and vertical weights. Only the three low
; bytes of each pixel are filtered; the fourth output byte is 0.
;-----------------------------------------------------------------------
cglobal ScaleSSE2X2
        push    ebp
        mov     ebp, esp
        push    ebx                     ; callee-saved: y weights, then x index table
        push    edi                     ; callee-saved: current source byte offset
        push    esi                     ; callee-saved: upper source row

        mov     esi, [fromPtr1]         ; esi = upper source row
        mov     edx, [fromPtr2]         ; edx = lower source row
        mov     eax, [xWeightPtr]       ; eax = x weight cursor
        mov     ebx, [yWeightPtr]
        mov     ecx, [length]

        ; xmm7 = {1-yDiff, 1-yDiff, yDiff, yDiff} in both 64-bit halves
        movd    xmm7, [ebx]             ; words: 1-yDiff, yDiff
        pshuflw xmm7, xmm7, 01010000b   ; 1-yDiff, 1-yDiff, yDiff, yDiff
        pshufd  xmm7, xmm7, 01000100b   ; copy low half into high half

        mov     ebx, [xIndexPtr]        ; ebx = x index cursor
        push    ebp                     ; keep the frame pointer; ebp becomes the output cursor
        mov     ebp, [toPtr]            ; argument names are unusable from here on

        shr     ecx, 1                  ; ecx = number of pixel pairs
        jz      .done                   ; nothing to do; the loop below must not run with ecx == 0

        mov     edi, [ebx]              ; byte offset of the first source pixel

        align 16
.loop:
        ; xmm3 = F1..F4 for both pixels of the pair (x weights * y weights)
        movdqa  xmm3, [eax]
        pmullw  xmm3, xmm7
        add     eax, 16

        ; first destination pixel: 2 upper + 2 lower source pixels in xmm0
        movq    xmm0, [esi+edi]
        movq    xmm1, [edx+edi]
        punpcklqdq xmm0, xmm1

        cmp     edi, [ebx+4]            ; same source offset for the second pixel?
        je      .same_source            ; then reuse the pixels already loaded

        ; second destination pixel: load its own 2x2 neighbourhood into xmm4
        mov     edi, [ebx+4]
        movq    xmm4, [esi+edi]
        movq    xmm5, [edx+edi]
        punpcklqdq xmm4, xmm5
        movdqa  xmm1, xmm0              ; xmm1, xmm2 = copies of pixel set 1
        pshufd  xmm2, xmm0, 0xE4        ; identity shuffle used as a register copy
        movdqa  xmm5, xmm4              ; xmm5, xmm6 = copies of pixel set 2
        pshufd  xmm6, xmm4, 0xE4
        jmp     .blend

        align 16
.same_source:
        movdqa  xmm1, xmm0              ; xmm1, xmm2 = copies of pixel set 1
        pshufd  xmm2, xmm0, 0xE4
        movdqa  xmm4, xmm0              ; pixel set 2 is identical to set 1
        pshufd  xmm5, xmm4, 0xE4
        movdqa  xmm6, xmm4

.blend:
        ; prefetchnta [edx+edi+16]
        add     ebx, 8                  ; advance the index cursor by one pair

        ; channel 0 (lowest byte of each pixel)
        pand    xmm0, [RGB_AND]
        pand    xmm4, [RGB_AND]
        packssdw xmm0, xmm4             ; 8 x 16-bit: set 1 in the low half, set 2 in the high half
        movdqa  xmm4, [RGB_AND]         ; xmm4 is free now; keep the mask in it
        pmaddwd xmm0, xmm3              ; dwords: a*F1+b*F2, c*F3+d*F4 per pixel

        ; channel 1 (second byte)
        psrld   xmm1, 8
        pand    xmm1, xmm4
        psrld   xmm5, 8
        pand    xmm5, xmm4
        packssdw xmm1, xmm5
        pmaddwd xmm1, xmm3

        ; channel 2 (third byte)
        psrld   xmm2, 16
        pand    xmm2, xmm4
        psrld   xmm6, 16
        pand    xmm6, xmm4
        packssdw xmm2, xmm6
        pmaddwd xmm2, xmm3

        ; scale each partial sum back and add the lower-row sum to the
        ; upper-row sum; the results end up in dwords 0 and 2
        psrld   xmm0, 14
        pshufd  xmm3, xmm0, 00110001b   ; bring dwords 1 and 3 down to 0 and 2
        paddd   xmm0, xmm3
        psrld   xmm1, 14
        pshufd  xmm3, xmm1, 00110001b
        paddd   xmm1, xmm3
        psrld   xmm2, 14
        pshufd  xmm3, xmm2, 00110001b
        paddd   xmm2, xmm3

        ; reassemble the three channels into two 32-bit pixels
        pslld   xmm1, 8
        por     xmm0, xmm1
        pslld   xmm2, 16
        por     xmm0, xmm2
        pshufd  xmm0, xmm0, 00001000b   ; pack dwords 0 and 2 into the low 64 bits
        movq    [ebp], xmm0             ; store 2 destination pixels
        add     ebp, 8

        mov     edi, [ebx]              ; source offset for the next pair
        sub     ecx, 1
        jnz     .loop

.done:
        pop     ebp                     ; restore the frame pointer saved above
        pop     esi
        pop     edi
        pop     ebx
        mov     esp, ebp
        pop     ebp
        ret
I am looking for some help to improve the performance of this bilinear scaling SSE2 code on Core 2 CPUs.
On my Atom N270 and on an i7 this code is about 2x faster than the MMX code, but on Core 2 CPUs it is only as fast as the MMX code.
Code follows
// Scales `from` into `to` with bilinear filtering and records the lowest
// cycle count seen so far in timeTaken.
//
// Source coordinates are tracked in 7-bit fixed point (1.0 == 128). For every
// destination column/row the integer part selects the top-left source pixel
// and the fractional part becomes the blend weight. The per-row work is done
// by the SSE2 routine _ScaleSSE2X2, which emits destination pixels in pairs.
//
// Does nothing when either bitmap is NULL, when the source is smaller than
// 2x2 (each output pixel blends a 2x2 neighbourhood), when the destination is
// empty, or when the scratch tables cannot be allocated.
//
// NOTE(review): pixels are addressed as 4 bytes each (xr << 2); this assumes
// both bitmaps use a 32-bit colour space - confirm against the callers.
void ConversionProcess::convert_SSE2(BBitmap *from, BBitmap *to)
{
	ULLint start = rdtsc();

	if (!from || !to)
		return;

	uint32 width = from->Bounds().IntegerWidth() + 1;
	uint32 height = from->Bounds().IntegerHeight() + 1;
	uint32 toWidth = to->Bounds().IntegerWidth() + 1;
	uint32 toHeight = to->Bounds().IntegerHeight() + 1;

	// A source narrower/shorter than 2 pixels would make the scaler read
	// past the bitmap; an empty destination would divide by zero below.
	if (width < 2 || height < 2 || toWidth == 0 || toHeight == 0)
		return;

	uint32 fromBPR = from->BytesPerRow();
	uint32 toBPR = to->BytesPerRow();

	// Source step per destination pixel, 7-bit fixed point.
	uint32 x_ratio = ((width - 1) << 7) / toWidth;
	uint32 y_ratio = ((height - 1) << 7) / toHeight;

	uint8 *toPtr = (uint8 *)to->Bits();
	uint8 *fromPtr1 = (uint8 *)from->Bits();	// upper source row
	uint8 *fromPtr2 = fromPtr1 + fromBPR;		// row directly below it

	struct FilterInfo {
		uint16 one_minus_diff;		// weight of the left/upper pixel
		uint16 diff;				// weight of the right/lower pixel
		uint16 one_minus_diff_rep;	// one_minus_diff repeated for the second source row
		uint16 diff_rep;			// diff repeated for the second source row
	};

	FilterInfo *xWeights = (FilterInfo *)memalign(16, toWidth * sizeof(FilterInfo));
	FilterInfo *yWeights = (FilterInfo *)memalign(16, toHeight * sizeof(FilterInfo));
	// The assembly loop pre-loads the index for the next pair before it
	// tests its counter, so it reads up to 2 entries past the last one.
	uint32 *xIndexes = (uint32 *)memalign(16, (toWidth + 2) * sizeof(uint32));
	uint32 *yIndexes = (uint32 *)memalign(16, toHeight * sizeof(uint32));

	if (!xWeights || !yWeights || !xIndexes || !yIndexes) {
		free(xWeights);
		free(yWeights);
		free(xIndexes);
		free(yIndexes);
		return;
	}

	// Horizontal table: byte offset of the left source pixel plus weights.
	// The two weights add up to 128 so that the product of a horizontal and
	// a vertical weight pair adds up to 1 << 14, which is exactly what the
	// scaler divides by.
	uint32 x = 0;
	for (uint32 j = 0; j < toWidth; j++) {
		uint32 xr = x >> 7;
		uint16 diff = x & 127;
		xWeights[j].diff = diff;
		xWeights[j].one_minus_diff = 128 - diff;
		xWeights[j].one_minus_diff_rep = xWeights[j].one_minus_diff;
		xWeights[j].diff_rep = xWeights[j].diff;
		xIndexes[j] = xr << 2;
		x += x_ratio;
	}
	// Give the over-read slots a defined value.
	xIndexes[toWidth] = 0;
	xIndexes[toWidth + 1] = 0;

	// Vertical table: byte offset of the upper source row plus weights.
	// Only the first two fields are filled in; the scaler loads just those.
	uint32 y = 0;
	for (uint32 j = 0; j < toHeight; j++) {
		uint32 yr = y >> 7;
		uint16 diff = y & 127;
		yWeights[j].diff = diff;
		yWeights[j].one_minus_diff = 128 - diff;
		yIndexes[j] = yr * fromBPR;
		y += y_ratio;
	}

	for (uint32 i = 0; i < toHeight; i++) {
		uint8 *upperRow = fromPtr1 + yIndexes[i];
		uint8 *lowerRow = fromPtr2 + yIndexes[i];

		// The SSE2 routine handles toWidth / 2 pixel pairs and must not be
		// entered with a pair count of zero.
		if (toWidth >= 2) {
			_ScaleSSE2X2(toPtr, upperRow, lowerRow, xIndexes, xWeights,
				&yWeights[i], toWidth);
		}

		// An odd width leaves one trailing column the pair-wise routine does
		// not produce; compute it here with the same arithmetic.
		if (toWidth & 1) {
			const uint32 j = toWidth - 1;
			const uint32 wx0 = xWeights[j].one_minus_diff;
			const uint32 wx1 = xWeights[j].diff;
			const uint32 wy0 = yWeights[i].one_minus_diff;
			const uint32 wy1 = yWeights[i].diff;
			const uint8 *top = upperRow + xIndexes[j];
			const uint8 *bottom = lowerRow + xIndexes[j];
			uint8 *out = toPtr + (j << 2);

			for (uint32 c = 0; c < 3; c++) {
				uint32 upper = top[c] * wx0 * wy0 + top[c + 4] * wx1 * wy0;
				uint32 lower = bottom[c] * wx0 * wy1 + bottom[c + 4] * wx1 * wy1;
				out[c] = (uint8)((upper >> 14) + (lower >> 14));
			}
			out[3] = 0;	// the SSE2 routine also writes 0 into the fourth byte
		}

		toPtr += toBPR;
	}

	free(xWeights);
	free(yWeights);
	free(xIndexes);
	free(yIndexes);

	// Keep the best (smallest) cycle count observed.
	ULLint stop = rdtsc() - start;
	if (stop < timeTaken) {
		timeTaken = stop;
	}
}
;
; Copyright (C) 2011 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised bilinear scaler
; cglobal <name>
; Starts an exported routine: declares the symbol _<name> as global, makes
; the bare <name> an alias for _<name> for the rest of the file, aligns the
; current position to 16 bytes and emits the entry label.
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
SECTION .data align=16

; 16-byte mask holding 0x000000FF in each of its four 32-bit lanes.
; ANDing a register with it keeps only the lowest byte of every lane.
RGB_AND:
        times 4 db 0xff, 0x00, 0x00, 0x00
; Stack-frame addresses of the arguments of
; void _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2, void* xIndexPtr, void *xWeightPtr, void *yWeightPtr, uint32 length);
; Each name stands for an ebp-relative address and is only meaningful while
; ebp holds the frame pointer set up at routine entry (first argument at ebp+8).
toPtr       equ ebp+8
fromPtr1    equ ebp+12
fromPtr2    equ ebp+16
xIndexPtr   equ ebp+20
xWeightPtr  equ ebp+24
yWeightPtr  equ ebp+28
length      equ ebp+32
SECTION .text align=16

;-----------------------------------------------------------------------
; void _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2,
;                   void *xIndexPtr, void *xWeightPtr, void *yWeightPtr,
;                   uint32 length);
;
; Writes one row of bilinearly filtered 32-bit pixels, two destination
; pixels per loop iteration (length / 2 iterations; an odd trailing pixel
; is not produced, and length < 2 writes nothing).
;
; ABI:   32-bit x86, all arguments on the stack, caller removes them.
; In:    toPtr      destination row
;        fromPtr1   upper source row, fromPtr2 the source row below it
;        xIndexPtr  uint32 byte offset of the left source pixel per
;                   destination pixel; one entry beyond the last pair is
;                   loaded (value unused)
;        xWeightPtr 4 x uint16 per destination pixel
;                   {1-xDiff, xDiff, 1-xDiff, xDiff}; must be 16-byte
;                   aligned (loaded with movdqa)
;        yWeightPtr 2 x uint16 {1-yDiff, yDiff} for this row
;        length     number of destination pixels
; Out:   nothing (eax is not a meaningful return value)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
; Saved: ebx, esi, edi, ebp
;
; Per channel the result is
;   ((a*F1 + b*F2) >> 14) + ((c*F3 + d*F4) >> 14)
; with a,b the upper and c,d the lower source pixels and F1..F4 the
; products of the horizontal and vertical weights. Only the three low
; bytes of each pixel are filtered; the fourth output byte is 0.
;-----------------------------------------------------------------------
cglobal ScaleSSE2X2
        push    ebp
        mov     ebp, esp
        push    ebx                     ; callee-saved: y weights, then x index table
        push    edi                     ; callee-saved: current source byte offset
        push    esi                     ; callee-saved: upper source row

        mov     esi, [fromPtr1]         ; esi = upper source row
        mov     edx, [fromPtr2]         ; edx = lower source row
        mov     eax, [xWeightPtr]       ; eax = x weight cursor
        mov     ebx, [yWeightPtr]
        mov     ecx, [length]

        ; xmm7 = {1-yDiff, 1-yDiff, yDiff, yDiff} in both 64-bit halves
        movd    xmm7, [ebx]             ; words: 1-yDiff, yDiff
        pshuflw xmm7, xmm7, 01010000b   ; 1-yDiff, 1-yDiff, yDiff, yDiff
        pshufd  xmm7, xmm7, 01000100b   ; copy low half into high half

        mov     ebx, [xIndexPtr]        ; ebx = x index cursor
        push    ebp                     ; keep the frame pointer; ebp becomes the output cursor
        mov     ebp, [toPtr]            ; argument names are unusable from here on

        shr     ecx, 1                  ; ecx = number of pixel pairs
        jz      .done                   ; nothing to do; the loop below must not run with ecx == 0

        mov     edi, [ebx]              ; byte offset of the first source pixel

        align 16
.loop:
        ; xmm3 = F1..F4 for both pixels of the pair (x weights * y weights)
        movdqa  xmm3, [eax]
        pmullw  xmm3, xmm7
        add     eax, 16

        ; first destination pixel: 2 upper + 2 lower source pixels in xmm0
        movq    xmm0, [esi+edi]
        movq    xmm1, [edx+edi]
        punpcklqdq xmm0, xmm1

        cmp     edi, [ebx+4]            ; same source offset for the second pixel?
        je      .same_source            ; then reuse the pixels already loaded

        ; second destination pixel: load its own 2x2 neighbourhood into xmm4
        mov     edi, [ebx+4]
        movq    xmm4, [esi+edi]
        movq    xmm5, [edx+edi]
        punpcklqdq xmm4, xmm5
        movdqa  xmm1, xmm0              ; xmm1, xmm2 = copies of pixel set 1
        pshufd  xmm2, xmm0, 0xE4        ; identity shuffle used as a register copy
        movdqa  xmm5, xmm4              ; xmm5, xmm6 = copies of pixel set 2
        pshufd  xmm6, xmm4, 0xE4
        jmp     .blend

        align 16
.same_source:
        movdqa  xmm1, xmm0              ; xmm1, xmm2 = copies of pixel set 1
        pshufd  xmm2, xmm0, 0xE4
        movdqa  xmm4, xmm0              ; pixel set 2 is identical to set 1
        pshufd  xmm5, xmm4, 0xE4
        movdqa  xmm6, xmm4

.blend:
        ; prefetchnta [edx+edi+16]
        add     ebx, 8                  ; advance the index cursor by one pair

        ; channel 0 (lowest byte of each pixel)
        pand    xmm0, [RGB_AND]
        pand    xmm4, [RGB_AND]
        packssdw xmm0, xmm4             ; 8 x 16-bit: set 1 in the low half, set 2 in the high half
        movdqa  xmm4, [RGB_AND]         ; xmm4 is free now; keep the mask in it
        pmaddwd xmm0, xmm3              ; dwords: a*F1+b*F2, c*F3+d*F4 per pixel

        ; channel 1 (second byte)
        psrld   xmm1, 8
        pand    xmm1, xmm4
        psrld   xmm5, 8
        pand    xmm5, xmm4
        packssdw xmm1, xmm5
        pmaddwd xmm1, xmm3

        ; channel 2 (third byte)
        psrld   xmm2, 16
        pand    xmm2, xmm4
        psrld   xmm6, 16
        pand    xmm6, xmm4
        packssdw xmm2, xmm6
        pmaddwd xmm2, xmm3

        ; scale each partial sum back and add the lower-row sum to the
        ; upper-row sum; the results end up in dwords 0 and 2
        psrld   xmm0, 14
        pshufd  xmm3, xmm0, 00110001b   ; bring dwords 1 and 3 down to 0 and 2
        paddd   xmm0, xmm3
        psrld   xmm1, 14
        pshufd  xmm3, xmm1, 00110001b
        paddd   xmm1, xmm3
        psrld   xmm2, 14
        pshufd  xmm3, xmm2, 00110001b
        paddd   xmm2, xmm3

        ; reassemble the three channels into two 32-bit pixels
        pslld   xmm1, 8
        por     xmm0, xmm1
        pslld   xmm2, 16
        por     xmm0, xmm2
        pshufd  xmm0, xmm0, 00001000b   ; pack dwords 0 and 2 into the low 64 bits
        movq    [ebp], xmm0             ; store 2 destination pixels
        add     ebp, 8

        mov     edi, [ebx]              ; source offset for the next pair
        sub     ecx, 1
        jnz     .loop

.done:
        pop     ebp                     ; restore the frame pointer saved above
        pop     esi
        pop     edi
        pop     ebx
        mov     esp, ebp
        pop     ebp
        ret
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
两个建议:
在 Core 2 上,把这段代码放进测试框架,并在一个好用的性能分析器(例如 Zoom)下运行,以查看热点位置以及依赖/其他停顿出现在哪里。
使用内在函数(intrinsics)重写 SIMD 代码,然后让编译器处理寄存器分配、指令调度和其他优化——像 ICC 甚至 gcc 这样不错的编译器,会比你手写的汇编做得好得多。此外,你还可以针对不同的 x86 CPU 系列重新定位,而无需重写代码。
Two suggestions:
Run this code in a test harness under a decent profiler on Core 2 (e.g. Zoom) to see where the hotspots and dependency/other stalls are.
Re-write the SIMD code using intrinsics and then let the compiler handle register allocation, instruction scheduling and other optimisations - a decent compiler such as ICC, or even gcc, will do a much better job than your hand-coded assembly. As a bonus, you can also re-target different x86 CPU families without having to re-write your code.