A = Expected to give a significant performance gain over the equivalent non-intrinsic-based code.
B = Non-intrinsic-based source code would be better; the intrinsic's implementation may map directly to native instructions, but it offers no significant performance gain.
C = Requires a contorted implementation for the particular microarchitecture. Will result in very poor performance if used.
Intrinsic | Across All IA | MMX(TM) Technology | Streaming SIMD Extensions | Streaming SIMD Extensions 2 | Itanium(TM) Architecture |
---|---|---|---|---|---|
void _mm_empty(void) |
N/A |
A |
A |
A |
B |
__m64 _m_from_int (int i) __m64 _mm_cvtsi32_si64 |
N/A |
A |
A |
A |
A |
int _m_to_int (__m64 m) int _mm_cvtsi64_si32 |
N/A |
A |
A |
A |
A |
__m64 _m_packsswb (__m64 m1, __m64 m2) __m64 _mm_packs_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_packssdw (__m64 m1, __m64 m2) __m64 _mm_packs_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_packuswb (__m64 m1, __m64 m2) __m64 _mm_packs_pu16 |
N/A |
A |
A |
A |
A |
__m64 _m_punpckhbw (__m64 m1, __m64 m2) __m64 _mm_unpackhi_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_punpckhwd (__m64 m1, __m64 m2) __m64 _mm_unpackhi_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_punpckhdq (__m64 m1, __m64 m2) __m64 _mm_unpackhi_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_punpcklbw (__m64 m1, __m64 m2) __m64 _mm_unpacklo_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_punpcklwd (__m64 m1, __m64 m2) __m64 _mm_unpacklo_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_punpckldq (__m64 m1, __m64 m2) __m64 _mm_unpacklo_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_paddb (__m64 m1, __m64 m2) __m64 _mm_add_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_paddw (__m64 m1, __m64 m2) __m64 _mm_add_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_paddd (__m64 m1, __m64 m2) __m64 _mm_add_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_paddsb (__m64 m1, __m64 m2) __m64 _mm_adds_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_paddsw (__m64 m1, __m64 m2) __m64 _mm_adds_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_paddusb (__m64 m1, __m64 m2) __m64 _mm_adds_pu8 |
N/A |
A |
A |
A |
A |
__m64 _m_paddusw (__m64 m1, __m64 m2) __m64 _mm_adds_pu16 |
N/A |
A |
A |
A |
A |
__m64 _m_psubb (__m64 m1, __m64 m2) __m64 _mm_sub_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_psubw (__m64 m1, __m64 m2) __m64 _mm_sub_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psubd (__m64 m1, __m64 m2) __m64 _mm_sub_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psubsb (__m64 m1, __m64 m2) __m64 _mm_subs_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_psubsw(__m64 m1, __m64 m2) __m64 _mm_subs_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psubusb(__m64 m1, __m64 m2) __m64 _mm_subs_pu8 |
N/A |
A |
A |
A |
A |
__m64 _m_psubusw(__m64 m1, __m64 m2) __m64 _mm_subs_pu16 |
N/A |
A |
A |
A |
A |
__m64 _m_pmaddwd (__m64 m1, __m64 m2) __m64 _mm_madd_pi16 |
N/A |
A |
A |
A |
C |
__m64 _m_pmulhw (__m64 m1, __m64 m2) __m64 _mm_mulhi_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_pmullw (__m64 m1, __m64 m2) __m64 _mm_mullo_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psllw (__m64 m, __m64 count) __m64 _mm_sll_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psllwi (__m64 m, int count) __m64 _mm_slli_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_pslld (__m64 m, __m64 count) __m64 _mm_sll_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_pslldi (__m64 m, int count) __m64 _mm_slli_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psllq (__m64 m, __m64 count) __m64 _mm_sll_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_psllqi (__m64 m, int count) __m64 _mm_slli_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_psraw (__m64 m, __m64 count) __m64 _mm_sra_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psrawi (__m64 m, int count) __m64 _mm_srai_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psrad (__m64 m, __m64 count) __m64 _mm_sra_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psradi (__m64 m, int count) __m64 _mm_srai_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psrlw (__m64 m, __m64 count) __m64 _mm_srl_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psrlwi (__m64 m, int count) __m64 _mm_srli_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_psrld (__m64 m, __m64 count) __m64 _mm_srl_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psrldi (__m64 m, int count) __m64 _mm_srli_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_psrlq (__m64 m, __m64 count) __m64 _mm_srl_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_psrlqi (__m64 m, int count) __m64 _mm_srli_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_pand (__m64 m1, __m64 m2) __m64 _mm_and_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_pandn (__m64 m1, __m64 m2) __m64 _mm_andnot_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_por (__m64 m1, __m64 m2) __m64 _mm_or_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_pxor (__m64 m1, __m64 m2) __m64 _mm_xor_si64 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpeqb (__m64 m1, __m64 m2) __m64 _mm_cmpeq_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpeqw (__m64 m1, __m64 m2) __m64 _mm_cmpeq_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpeqd (__m64 m1, __m64 m2) __m64 _mm_cmpeq_pi32 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpgtb (__m64 m1, __m64 m2) __m64 _mm_cmpgt_pi8 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpgtw (__m64 m1, __m64 m2) __m64 _mm_cmpgt_pi16 |
N/A |
A |
A |
A |
A |
__m64 _m_pcmpgtd (__m64 m1, __m64 m2) __m64 _mm_cmpgt_pi32 |
N/A |
A |
A |
A |
A |
__m64 _mm_setzero_si64 () |
N/A |
A |
A |
A |
A |
__m64 _mm_set_pi32 ( int i1, int i0) |
N/A |
A |
A |
A |
A |
__m64 _mm_set_pi16 ( short w3, short w2, short w1, short w0) |
N/A |
A |
A |
A |
C |
__m64 _mm_set_pi8 ( char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) |
N/A |
A |
A |
A |
C |
__m64 _mm_set1_pi32 ( int i) |
N/A |
A |
A |
A |
A |
__m64 _mm_set1_pi16 ( short w) |
N/A |
A |
A |
A |
A |
__m64 _mm_set1_pi8 ( char b) |
N/A |
A |
A |
A |
A |
__m64 _mm_setr_pi32 ( int i0, int i1) |
N/A |
A |
A |
A |
A |
__m64 _mm_setr_pi16 (short w0, short w1, short w2, short w3 ) |
N/A |
A |
A |
A |
C |
__m64 _mm_setr_pi8 (char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7) |
N/A |
A |
A |
A |
C |
_mm_empty is implemented in Itanium instructions as a NOP for source compatibility only.