The floating-point intrinsics listed below are designed for the Intel processors code-named "Prescott".
extern __m128 _mm_addsub_ps(__m128 a, __m128 b);
Subtracts even vector elements while adding odd vector elements.
r0 := a0 - b0;
r1 := a1 + b1;
r2 := a2 - b2;
r3 := a3 + b3;
extern __m128 _mm_hadd_ps(__m128 a, __m128 b);
Adds adjacent vector elements.
r0 := a0 + a1;
r1 := a2 + a3;
r2 := b0 + b1;
r3 := b2 + b3;
extern __m128 _mm_hsub_ps(__m128 a, __m128 b);
Subtracts adjacent vector elements.
r0 := a0 - a1;
r1 := a2 - a3;
r2 := b0 - b1;
r3 := b2 - b3;
extern __m128 _mm_movehdup_ps(__m128 a);
Duplicates odd vector elements into even vector elements.
r0 := a1;
r1 := a1;
r2 := a3;
r3 := a3;
extern __m128 _mm_moveldup_ps(__m128 a);
Duplicates even vector elements into odd vector elements.
r0 := a0;
r1 := a0;
r2 := a2;
r3 := a2;
extern __m128d _mm_addsub_pd(__m128d a, __m128d b);
Adds upper vector element while subtracting lower vector element.
r0 := a0 - b0;
r1 := a1 + b1;
extern __m128d _mm_hadd_pd(__m128d a, __m128d b);
Adds adjacent vector elements.
r0 := a0 + a1;
r1 := b0 + b1;
extern __m128d _mm_hsub_pd(__m128d a, __m128d b);
Subtracts adjacent vector elements.
r0 := a0 - a1;
r1 := b0 - b1;
extern __m128d _mm_loaddup_pd(double const * dp);
Duplicates a double value into upper and lower vector elements.
r0 := *dp;
r1 := *dp;
extern __m128d _mm_movedup_pd(__m128d a);
Duplicates lower vector element into upper vector element.
r0 := a0;
r1 := a0;