在 SSE2/SSSE3 上转置 8 个 16 位元素寄存器

发布于 2024-08-26 18:09:16 字数 3932 浏览 9 评论 0原文

（我是 SSE/asm 的新手，如果这是明显或多余的，我深表歉意）

是否有更好的方法来转置包含 16 位值的 8 个 SSE 寄存器，而不是执行 24 unpck[lh]ps 和 8/16+ 洗牌并使用8个额外的寄存器？（注意使用最多 SSSE 3 指令、Intel Merom，又名缺少 SSE4 的 BLEND*。）

假设您有寄存器 v[0-7] 并使用 t0-t7 作为辅助寄存器。在伪内在代码中：

/* Phase 1: process lower parts of the registers */
/* Level 1: work first part of the vectors */
/*   v[0]  A0 A1 A2 A3 A4 A5 A6 A7
**   v[1]  B0 B1 B2 B3 B4 B5 B6 B7
**   v[2]  C0 C1 C2 C3 C4 C5 C6 C7
**   v[3]  D0 D1 D2 D3 D4 D5 D6 D7
**   v[4]  E0 E1 E2 E3 E4 E5 E6 E7
**   v[5]  F0 F1 F2 F3 F4 F5 F6 F7
**   v[6]  G0 G1 G2 G3 G4 G5 G6 G7
**   v[7]  H0 H1 H2 H3 H4 H5 H6 H7 */
t0 = unpcklps (v[0], v[1]); /* Extract first half interleaving */
t1 = unpcklps (v[2], v[3]); /* Extract first half interleaving */
t2 = unpcklps (v[4], v[5]); /* Extract first half interleaving */
t3 = unpcklps (v[6], v[7]); /* Extract first half interleaving */
t0 = pshufhw (t0, 0xD8); /* Flip middle 2 high */
t0 = pshuflw (t0, 0xD8); /* Flip middle 2 low */
t1 = pshufhw (t1, 0xD8); /* Flip middle 2 high */
t1 = pshuflw (t1, 0xD8); /* Flip middle 2 low */
t2 = pshufhw (t2, 0xD8); /* Flip middle 2 high */
t2 = pshuflw (t2, 0xD8); /* Flip middle 2 low */
t3 = pshufhw (t3, 0xD8); /* Flip middle 2 high */
t3 = pshuflw (t3, 0xD8); /* Flip middle 2 low */
/*   t0   A0 B0 A1 B1 A2 B2 A3 B3  (A B - 0 1 2 3)
**   t1   C0 D0 C1 D1 C2 D2 C3 D3  (C D - 0 1 2 3)
**   t2   E0 F0 E1 F1 E2 F2 E3 F3  (E F - 0 1 2 3)
**   t3   G0 H0 G1 H1 G2 H2 G3 H3  (G H - 0 1 2 3) */
/* L2 */
t4 = unpcklps (t0, t1);
t5 = unpcklps (t2, t3);
t6 = unpckhps (t0, t1);
t7 = unpckhps (t2, t3);
/*   t4   A0 B0 C0 D0 A1 B1 C1 D1 (A B C D - 0 1)
**   t5   E0 F0 G0 H0 E1 F1 G1 H1 (E F G H - 0 1)
**   t6   A2 B2 C2 D2 A3 B3 C3 D3 (A B C D - 2 3)
**   t7   E2 F2 G2 H2 E3 F3 G3 H3 (E F G H - 2 3) */
/* Phase 2: same with higher parts of the registers */
/*   A    A0 A1 A2 A3 A4 A5 A6 A7
**   B    B0 B1 B2 B3 B4 B5 B6 B7
**   C    C0 C1 C2 C3 C4 C5 C6 C7
**   D    D0 D1 D2 D3 D4 D5 D6 D7
**   E    E0 E1 E2 E3 E4 E5 E6 E7
**   F    F0 F1 F2 F3 F4 F5 F6 F7
**   G    G0 G1 G2 G3 G4 G5 G6 G7
**   H    H0 H1 H2 H3 H4 H5 H6 H7 */
t0 = unpckhps (v[0], v[1]);
t0 = pshufhw (t0, 0xD8); /* Flip middle 2 high */
t0 = pshuflw (t0, 0xD8); /* Flip middle 2 low */
t1 = unpckhps (v[2], v[3]);
t1 = pshufhw (t1, 0xD8); /* Flip middle 2 high */
t1 = pshuflw (t1, 0xD8); /* Flip middle 2 low */
t2 = unpckhps (v[4], v[5]);
t2 = pshufhw (t2, 0xD8); /* Flip middle 2 high */
t2 = pshuflw (t2, 0xD8); /* Flip middle 2 low */
t3 = unpckhps (v[6], v[7]);
t3 = pshufhw (t3, 0xD8); /* Flip middle 2 high */
t3 = pshuflw (t3, 0xD8); /* Flip middle 2 low */
/*   t0   A4 B4 A5 B5 A6 B6 A7 B7  (A B - 4 5 6 7)
**   t1   C4 D4 C5 D5 C6 D6 C7 D7  (C D - 4 5 6 7)
**   t2   E4 F4 E5 F5 E6 F6 E7 F7  (E F - 4 5 6 7)
**   t3   G4 H4 G5 H5 G6 H6 G7 H7  (G H - 4 5 6 7) */
/* Back to first part, v[0-3] can be re-written now */
/* L3 */
v[0] = unpcklpd (t4, t5);
v[1] = unpckhpd (t4, t5);
v[2] = unpcklpd (t6, t7);
v[3] = unpckhpd (t6, t7);
/* v[0] = A0 B0 C0 D0 E0 F0 G0 H0
** v[1] = A1 B1 C1 D1 E1 F1 G1 H1
** v[2] = A2 B2 C2 D2 E2 F2 G2 H2
** v[3] = A3 B3 C3 D3 E3 F3 G3 H3 */
/* Back to second part, t[4-7] can be re-written now... */
/* L2 */
t4 = unpcklps (t0, t1);
t5 = unpcklps (t2, t3);
t6 = unpckhps (t0, t1);
t7 = unpckhps (t2, t3);
/*   t4   A4 B4 C4 D4 A5 B5 C5 D5 (A B C D - 4 5)
**   t5   E4 F4 G4 H4 E5 F5 G5 H5 (E F G H - 4 5)
**   t6   A6 B6 C6 D6 A7 B7 C7 D7 (A B C D - 6 7)
**   t7   E6 F6 G6 H6 E7 F7 G7 H7 (E F G H - 6 7) */
/* L3 */
v[4] = unpcklpd (t4, t5);
v[5] = unpckhpd (t4, t5);
v[6] = unpcklpd (t6, t7);
v[7] = unpckhpd (t6, t7);
/* v[4] = A4 B4 C4 D4 E4 F4 G4 H4
** v[5] = A5 B5 C5 D5 E5 F5 G5 H5
** v[6] = A6 B6 C6 D6 E6 F6 G6 H6
** v[7] = A7 B7 C7 D7 E7 F7 G7 H7 */

每个 unpck* 需要 3 个周期的延迟，或者 2 个周期的延迟（由 Agner 报告）。这会扼杀使用 SSE（在此代码上）的性能收益的很大一部分，因为这种寄存器舞蹈几乎需要一个周期每个元素。我试图理解 x264 的 x86 转置的 asm 文件，但未能理解宏。

谢谢！

原文

(I'm a newbie to SSE/asm, apologies if this is obvious or redundant)

Is there a better way to transpose 8 SSE registers containing 16-bit values than performing 24 unpck[lh]ps and 8/16+ shuffles and using 8 extra registers? (Note using up to SSSE 3 instructions, Intel Merom, aka lacking BLEND* from SSE4.)

Say you have registers v[0-7] and use t0-t7 as aux registers. In pseudo intrinsics code:

/* Phase 1: process lower parts of the registers */
/* Level 1: work first part of the vectors */
/*   v[0]  A0 A1 A2 A3 A4 A5 A6 A7
**   v[1]  B0 B1 B2 B3 B4 B5 B6 B7
**   v[2]  C0 C1 C2 C3 C4 C5 C6 C7
**   v[3]  D0 D1 D2 D3 D4 D5 D6 D7
**   v[4]  E0 E1 E2 E3 E4 E5 E6 E7
**   v[5]  F0 F1 F2 F3 F4 F5 F6 F7
**   v[6]  G0 G1 G2 G3 G4 G5 G6 G7
**   v[7]  H0 H1 H2 H3 H4 H5 H6 H7 */
t0 = unpcklps (v[0], v[1]); /* Extract first half interleaving */
t1 = unpcklps (v[2], v[3]); /* Extract first half interleaving */
t2 = unpcklps (v[4], v[5]); /* Extract first half interleaving */
t3 = unpcklps (v[6], v[7]); /* Extract first half interleaving */
t0 = pshufhw (t0, 0xD8); /* Flip middle 2 high */
t0 = pshuflw (t0, 0xD8); /* Flip middle 2 low */
t1 = pshufhw (t1, 0xD8); /* Flip middle 2 high */
t1 = pshuflw (t1, 0xD8); /* Flip middle 2 low */
t2 = pshufhw (t2, 0xD8); /* Flip middle 2 high */
t2 = pshuflw (t2, 0xD8); /* Flip middle 2 low */
t3 = pshufhw (t3, 0xD8); /* Flip middle 2 high */
t3 = pshuflw (t3, 0xD8); /* Flip middle 2 low */
/*   t0   A0 B0 A1 B1 A2 B2 A3 B3  (A B - 0 1 2 3)
**   t1   C0 D0 C1 D1 C2 D2 C3 D3  (C D - 0 1 2 3)
**   t2   E0 F0 E1 F1 E2 F2 E3 F3  (E F - 0 1 2 3)
**   t3   G0 H0 G1 H1 G2 H2 G3 H3  (G H - 0 1 2 3) */
/* L2 */
t4 = unpcklps (t0, t1);
t5 = unpcklps (t2, t3);
t6 = unpckhps (t0, t1);
t7 = unpckhps (t2, t3);
/*   t4   A0 B0 C0 D0 A1 B1 C1 D1 (A B C D - 0 1)
**   t5   E0 F0 G0 H0 E1 F1 G1 H1 (E F G H - 0 1)
**   t6   A2 B2 C2 D2 A3 B3 C3 D3 (A B C D - 2 3)
**   t7   E2 F2 G2 H2 E3 F3 G3 H3 (E F G H - 2 3) */
/* Phase 2: same with higher parts of the registers */
/*   A    A0 A1 A2 A3 A4 A5 A6 A7
**   B    B0 B1 B2 B3 B4 B5 B6 B7
**   C    C0 C1 C2 C3 C4 C5 C6 C7
**   D    D0 D1 D2 D3 D4 D5 D6 D7
**   E    E0 E1 E2 E3 E4 E5 E6 E7
**   F    F0 F1 F2 F3 F4 F5 F6 F7
**   G    G0 G1 G2 G3 G4 G5 G6 G7
**   H    H0 H1 H2 H3 H4 H5 H6 H7 */
t0 = unpckhps (v[0], v[1]);
t0 = pshufhw (t0, 0xD8); /* Flip middle 2 high */
t0 = pshuflw (t0, 0xD8); /* Flip middle 2 low */
t1 = unpckhps (v[2], v[3]);
t1 = pshufhw (t1, 0xD8); /* Flip middle 2 high */
t1 = pshuflw (t1, 0xD8); /* Flip middle 2 low */
t2 = unpckhps (v[4], v[5]);
t2 = pshufhw (t2, 0xD8); /* Flip middle 2 high */
t2 = pshuflw (t2, 0xD8); /* Flip middle 2 low */
t3 = unpckhps (v[6], v[7]);
t3 = pshufhw (t3, 0xD8); /* Flip middle 2 high */
t3 = pshuflw (t3, 0xD8); /* Flip middle 2 low */
/*   t0   A4 B4 A5 B5 A6 B6 A7 B7  (A B - 4 5 6 7)
**   t1   C4 D4 C5 D5 C6 D6 C7 D7  (C D - 4 5 6 7)
**   t2   E4 F4 E5 F5 E6 F6 E7 F7  (E F - 4 5 6 7)
**   t3   G4 H4 G5 H5 G6 H6 G7 H7  (G H - 4 5 6 7) */
/* Back to first part, v[0-3] can be re-written now */
/* L3 */
v[0] = unpcklpd (t4, t5);
v[1] = unpckhpd (t4, t5);
v[2] = unpcklpd (t6, t7);
v[3] = unpckhpd (t6, t7);
/* v[0] = A0 B0 C0 D0 E0 F0 G0 H0
** v[1] = A1 B1 C1 D1 E1 F1 G1 H1
** v[2] = A2 B2 C2 D2 E2 F2 G2 H2
** v[3] = A3 B3 C3 D3 E3 F3 G3 H3 */
/* Back to second part, t[4-7] can be re-written now... */
/* L2 */
t4 = unpcklps (t0, t1);
t5 = unpcklps (t2, t3);
t6 = unpckhps (t0, t1);
t7 = unpckhps (t2, t3);
/*   t4   A4 B4 C4 D4 A5 B5 C5 D5 (A B C D - 4 5)
**   t5   E4 F4 G4 H4 E5 F5 G5 H5 (E F G H - 4 5)
**   t6   A6 B6 C6 D6 A7 B7 C7 D7 (A B C D - 6 7)
**   t7   E6 F6 G6 H6 E7 F7 G7 H7 (E F G H - 6 7) */
/* L3 */
v[4] = unpcklpd (t4, t5);
v[5] = unpckhpd (t4, t5);
v[6] = unpcklpd (t6, t7);
v[7] = unpckhpd (t6, t7);
/* v[4] = A4 B4 C4 D4 E4 F4 G4 H4
** v[5] = A5 B5 C5 D5 E5 F5 G5 H5
** v[6] = A6 B6 C6 D6 E6 F6 G6 H6
** v[7] = A7 B7 C7 D7 E7 F7 G7 H7 */

Each unpck* takes 3 cycles of latency, or 2 for reciprocal throughput (reported by Agner.) This is killing big part of the performance gains from using SSE (on this code) because this register dance takes almost one cycle per element. I tried to understand x264's asm file for x86 transpose but failed understanding the macros.

Thanks!

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

回首观望 2024-09-02 18:09:16

是的，您总共可以通过 24 条指令来完成：

8 x _mm_unpacklo_epi16/_mm_unpackhi_epi16 (PUNPCKLWD/PUNPCKHWD)
8 x _mm_unpacklo_epi32/_mm_unpackhi_epi32 (PUNPCKLDQ/PUNPCKHDQ)
8 x _mm_unpacklo_epi64/_mm_unpackhi_epi64 (PUNPCKLQDQ/PUNPCKHQDQ)

如果您需要更多详细信息，请告诉我，但这相当明显。

Yes, you can do it in 24 instructions total:

8 x _mm_unpacklo_epi16/_mm_unpackhi_epi16 (PUNPCKLWD/PUNPCKHWD)
8 x _mm_unpacklo_epi32/_mm_unpackhi_epi32 (PUNPCKLDQ/PUNPCKHDQ)
8 x _mm_unpacklo_epi64/_mm_unpackhi_epi64 (PUNPCKLQDQ/PUNPCKHQDQ)

Let me know if you need more details, but it's fairly obvious.

回复收藏 0 原文

染火枫林 2024-09-02 18:09:16

我必须自己做这件事，所以这是我的最终结果。请注意，我使用的加载和存储指令适用于 16 字节对齐数组，其声明使用
m128i* 数组 = (__m128i*) _mm_malloc(N*sizeof(uint16_t), 16);
或者
int16_t array[N]__attribute((aligned(16)));

__m128i *p_input  = (__m128i*)array;
__m128i *p_output = (__m128i*)array;
__m128i a = _mm_load_si128(p_input++);
__m128i b = _mm_load_si128(p_input++);
__m128i c = _mm_load_si128(p_input++);
__m128i d = _mm_load_si128(p_input++);
__m128i e = _mm_load_si128(p_input++);
__m128i f = _mm_load_si128(p_input++);
__m128i g = _mm_load_si128(p_input++);
__m128i h = _mm_load_si128(p_input);

__m128i a03b03 = _mm_unpacklo_epi16(a, b);
__m128i c03d03 = _mm_unpacklo_epi16(c, d);
__m128i e03f03 = _mm_unpacklo_epi16(e, f);
__m128i g03h03 = _mm_unpacklo_epi16(g, h);
__m128i a47b47 = _mm_unpackhi_epi16(a, b);
__m128i c47d47 = _mm_unpackhi_epi16(c, d);
__m128i e47f47 = _mm_unpackhi_epi16(e, f);
__m128i g47h47 = _mm_unpackhi_epi16(g, h);

__m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
__m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
__m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
__m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
__m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
__m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
__m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
__m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

__m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
__m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
__m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
__m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
__m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
__m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
__m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
__m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);

_mm_store_si128(p_output++, a0b0c0d0e0f0g0h0);
_mm_store_si128(p_output++, a1b1c1d1e1f1g1h1);
_mm_store_si128(p_output++, a2b2c2d2e2f2g2h2);
_mm_store_si128(p_output++, a3b3c3d3e3f3g3h3);
_mm_store_si128(p_output++, a4b4c4d4e4f4g4h4);
_mm_store_si128(p_output++, a5b5c5d5e5f5g5h5);
_mm_store_si128(p_output++, a6b6c6d6e6f6g6h6);
_mm_store_si128(p_output, a7b7c7d7e7f7g7h7);

I had to do this myself, so here is my final result. Note that the load and store instructions I have used are for 16-byte aligned arrays, which were declared using
m128i* array = (__m128i*) _mm_malloc(N*sizeof(uint16_t), 16);
OR
int16_t array[N]__attribute((aligned(16)));

__m128i *p_input  = (__m128i*)array;
__m128i *p_output = (__m128i*)array;
__m128i a = _mm_load_si128(p_input++);
__m128i b = _mm_load_si128(p_input++);
__m128i c = _mm_load_si128(p_input++);
__m128i d = _mm_load_si128(p_input++);
__m128i e = _mm_load_si128(p_input++);
__m128i f = _mm_load_si128(p_input++);
__m128i g = _mm_load_si128(p_input++);
__m128i h = _mm_load_si128(p_input);

__m128i a03b03 = _mm_unpacklo_epi16(a, b);
__m128i c03d03 = _mm_unpacklo_epi16(c, d);
__m128i e03f03 = _mm_unpacklo_epi16(e, f);
__m128i g03h03 = _mm_unpacklo_epi16(g, h);
__m128i a47b47 = _mm_unpackhi_epi16(a, b);
__m128i c47d47 = _mm_unpackhi_epi16(c, d);
__m128i e47f47 = _mm_unpackhi_epi16(e, f);
__m128i g47h47 = _mm_unpackhi_epi16(g, h);

__m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
__m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
__m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
__m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
__m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
__m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
__m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
__m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

__m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
__m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
__m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
__m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
__m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
__m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
__m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
__m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);

_mm_store_si128(p_output++, a0b0c0d0e0f0g0h0);
_mm_store_si128(p_output++, a1b1c1d1e1f1g1h1);
_mm_store_si128(p_output++, a2b2c2d2e2f2g2h2);
_mm_store_si128(p_output++, a3b3c3d3e3f3g3h3);
_mm_store_si128(p_output++, a4b4c4d4e4f4g4h4);
_mm_store_si128(p_output++, a5b5c5d5e5f5g5h5);
_mm_store_si128(p_output++, a6b6c6d6e6f6g6h6);
_mm_store_si128(p_output, a7b7c7d7e7f7g7h7);

回复收藏 0 原文