A = Expected to give significant performance gain over non-intrinsic-based code equivalent.
B = Non-intrinsic-based source code would be better; the intrinsic's implementation may map directly to native instructions, but it offers no significant performance gain.
C = Requires contorted implementation for particular microarchitecture. Will result in very poor performance if used.
Intrinsic | Across All IA | MMX(TM) Technology | Streaming SIMD Extensions | Streaming SIMD Extensions 2 | Itanium(TM) Architecture |
---|---|---|---|---|---|
__m128 _mm_add_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_add_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_sub_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_sub_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_mul_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_mul_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_div_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_div_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_sqrt_ss (__m128 a) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_sqrt_ps (__m128 a) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_rcp_ss (__m128 a) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_rcp_ps (__m128 a) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_rsqrt_ss (__m128 a) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_rsqrt_ps (__m128 a) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_min_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_min_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_max_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_max_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_and_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_andnot_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_or_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_xor_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpeq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpeq_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmplt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmplt_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmple_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmple_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpgt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpgt_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpge_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpge_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpneq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpneq_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpnle_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpnle_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpngt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpngt_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpnge_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpnge_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpord_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpord_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cmpunord_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
__m128 _mm_cmpunord_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
int _mm_comieq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_comilt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_comile_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_comigt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_comige_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_comineq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomieq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomilt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomile_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomigt_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomige_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_ucomineq_ss (__m128 a, __m128 b) |
N/A |
N/A |
B |
B |
B |
int _mm_cvtss_si32 (__m128 a) int _mm_cvt_ss2si |
N/A |
N/A |
A |
A |
B |
__m64 _mm_cvtps_pi32 (__m128 a) __m64 _mm_cvt_ps2pi |
N/A |
N/A |
A |
A |
A |
int _mm_cvttss_si32 (__m128 a) int _mm_cvtt_ss2si |
N/A |
N/A |
A |
A |
B |
__m64 _mm_cvttps_pi32 (__m128 a) __m64 _mm_cvtt_ps2pi |
N/A |
N/A |
A |
A |
A |
__m128 _mm_cvtsi32_ss (__m128 a, int b) __m128 _mm_cvt_si2ss |
N/A |
N/A |
A |
A |
B |
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b) __m128 _mm_cvt_pi2ps |
N/A |
N/A |
A |
A |
C |
__m128 _mm_cvtpi16_ps (__m64 a) |
N/A |
N/A |
A |
A |
C |
__m128 _mm_cvtpu16_ps (__m64 a) |
N/A |
N/A |
A |
A |
C |
__m128 _mm_cvtpi8_ps (__m64 a) |
N/A |
N/A |
A |
A |
C |
__m128 _mm_cvtpu8_ps (__m64 a) |
N/A |
N/A |
A |
A |
C |
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) |
N/A |
N/A |
A |
A |
C |
__m64 _mm_cvtps_pi16 (__m128 a) |
N/A |
N/A |
A |
A |
C |
__m64 _mm_cvtps_pi8 (__m128 a) |
N/A |
N/A |
A |
A |
C |
__m128 _mm_move_ss (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_unpackhi_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_unpacklo_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_movehl_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_movelh_ps (__m128 a, __m128 b) |
N/A |
N/A |
A |
A |
A |
int _mm_movemask_ps (__m128 a) |
N/A |
N/A |
A |
A |
C |
unsigned int _mm_getcsr (void) |
N/A |
N/A |
A |
A |
A |
void _mm_setcsr (unsigned int i) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_loadh_pi (__m128 a, __m64 *p) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_loadl_pi (__m128 a, __m64 *p) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_load_ss (float *p) |
N/A |
N/A |
A |
A |
B |
__m128 _mm_load1_ps (float *p) __m128 _mm_load_ps1 |
N/A |
N/A |
A |
A |
A |
__m128 _mm_load_ps (float *p) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_loadu_ps (float *p) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_loadr_ps (float *p) |
N/A |
N/A |
A |
A |
A |
void _mm_storeh_pi ( __m64 *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_storel_pi ( __m64 *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_store_ss ( float *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_store_ps ( float *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_store1_ps ( float *p, __m128 a) void _mm_store_ps1 |
N/A |
N/A |
A |
A |
A |
void _mm_storeu_ps ( float *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_storer_ps ( float *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_set_ss ( float w) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_set1_ps ( float w) __m128 _mm_set_ps1 |
N/A |
N/A |
A |
A |
A |
__m128 _mm_set_ps ( float z, float y, float x, float w) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_setr_ps ( float z, float y, float x, float w) |
N/A |
N/A |
A |
A |
A |
__m128 _mm_setzero_ps ( void) |
N/A |
N/A |
A |
A |
A |
void _mm_prefetch (char *p, int i) |
N/A |
N/A |
A |
A |
A |
void _mm_stream_pi (__m64 *p, __m64 a) |
N/A |
N/A |
A |
A |
A |
void _mm_stream_ps ( float *p, __m128 a) |
N/A |
N/A |
A |
A |
A |
void _mm_sfence (void) |
N/A |
N/A |
A |
A |
A |
int _mm_extract_pi16 ( __m64 a, int n) int _m_pextrw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_insert_pi16 ( __m64 a, int d, int n) __m64 _m_pinsrw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_max_pi16 ( __m64 a, __m64 b) __m64 _m_pmaxsw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_max_pu8 ( __m64 a, __m64 b) __m64 _m_pmaxub |
N/A |
N/A |
A |
A |
A |
__m64 _mm_min_pi16 ( __m64 a, __m64 b) __m64 _m_pminsw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_min_pu8 ( __m64 a, __m64 b) __m64 _m_pminub |
N/A |
N/A |
A |
A |
A |
int _mm_movemask_pi8 ( __m64 a) int _m_pmovmskb |
N/A |
N/A |
A |
A |
C |
__m64 _mm_mulhi_pu16 ( __m64 a, __m64 b) __m64 _m_pmulhuw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_shuffle_pi16 ( __m64 a, int n) __m64 _m_pshufw |
N/A |
N/A |
A |
A |
A |
void _mm_maskmove_si64 ( __m64 d, __m64 n, char *p) void _m_maskmovq |
N/A |
N/A |
A |
A |
C |
__m64 _mm_avg_pu8 ( __m64 a, __m64 b) __m64 _m_pavgb |
N/A |
N/A |
A |
A |
A |
__m64 _mm_avg_pu16 ( __m64 a, __m64 b) __m64 _m_pavgw |
N/A |
N/A |
A |
A |
A |
__m64 _mm_sad_pu8 ( __m64 a, __m64 b) __m64 _m_psadbw |
N/A |
N/A |
A |
A |
A |