This sample program uses the F32vec4 class to average the elements of a 20-element floating-point array.
// Include Streaming SIMD Extension Class Definitions #include <fvec.h>
// Shuffle any 2 single precision floating point from a // into low 2 SP FP and shuffle any 2 SP FP from b // into high 2 SP FP of destination #define SHUFFLE(a,b,i) (F32vec4)_mm_shuffle_ps(a,b,i) #include <stdio.h> #define SIZE 20
// Global variables float result; _MM_ALIGN 16 float array[SIZE];
//*****************************************************
// Function: Add20ArrayElements
// Add all the elements of a 20-element float array.
//
// array  - the 20 floats, viewed as five consecutive F32vec4 values.
//          The first four floats are fetched with _mm_load_ps, which
//          requires a 16-byte aligned address.
// result - receives the scalar sum of all 20 elements.
//*****************************************************
void Add20ArrayElements (F32vec4 *array, float *result)
{
    F32vec4 vec0, vec1;

    vec0 = _mm_load_ps ((float *) array);   // Load array's first 4 floats

    //*****************************************************
    // Add all elements of the array, 4 elements at a time
    //*****************************************************
    vec0 += array[1];   // Add elements 5-8
    vec0 += array[2];   // Add elements 9-12
    vec0 += array[3];   // Add elements 13-16
    vec0 += array[4];   // Add elements 17-20

    //*****************************************************
    // There are now 4 partial sums (s0, s1, s2, s3) in vec0.
    // Add the 2 lower sums to the 2 upper sums,
    // then add those 2 results together.
    // Both shuffles take vec0 for both operands, so no lane of an
    // unassigned vector is ever read.
    //*****************************************************
    vec1 = SHUFFLE(vec0, vec0, 0x4E);   // (s2, s3, s0, s1): swap the two halves
    vec0 += vec1;                       // (s0+s2, s1+s3, s2+s0, s3+s1)
    vec1 = SHUFFLE(vec0, vec0, 0xB1);   // swap neighbouring lanes
    vec0 += vec1;                       // every lane now holds the total

    _mm_store_ss (result, vec0);        // Store the final sum (lane 0)
}
void main(int argc, char *argv[]) { int i; // Initialize the array for (i=0; i < SIZE; i++) { array[i] = (float) i; }
// Call function to add all array elements Add20ArrayElements (array, &result);
// Print average array element value printf ("Average of all array values = %f\n", result/20.); printf ("The correct answer is %f\n\n\n", 9.5); } |