|
Manual Processor Dispatch: Example
This example shows multiple optimized functions in a single binary file.
You can use manual processor dispatch to support up to seven different
versions of any one function, targeting up to seven different processors.
#include <stddef.h>
#include <mmintrin.h>
/* Pentium processor function does not use intrinsics to add two arrays.
 *
 * Stores r[i] = a[i] + b[i] for each of the l elements.
 *
 *   r  output array; receives l sums
 *   a  first input array (read only)
 *   b  second input array
 *   l  number of elements to process; 0 leaves r untouched
 */
__declspec(cpu_specific(pentium))
void array_sum(int *r, int const *a, int *b, size_t l)
{
    /* The parameters themselves act as the cursors and the counter. */
    for (; l > 0; l--) {
        *r++ = *a++ + *b++;
    }
}
/* Implementation for a Pentium processor with MMX technology uses
 * an MMX instruction intrinsic to add two elements simultaneously.
 *
 * Stores r[i] = a[i] + b[i] for each of the l elements.
 *
 *   r  output array; receives l sums
 *   a  first input array (read only)
 *   b  second input array
 *   l  number of elements to process; 0 leaves r untouched
 *
 * Each __m64 holds 64 bits, i.e. two elements when int is 32 bits wide
 * (assumed here), so the packed 32-bit add is used rather than the
 * 16-bit one.
 * NOTE(review): the arrays are accessed through __m64 pointers, which
 * presumably expects them to be 8-byte aligned -- confirm for callers.
 */
__declspec(cpu_specific(pentium_MMX))
void array_sum(int *r, int const *a, int *b, size_t l)
{
    __m64 *mmx_r = (__m64 *)r;
    __m64 const *mmx_a = (__m64 const *)a;
    __m64 const *mmx_b = (__m64 const *)b;
    size_t pairs = l / 2;
    size_t i;

    for (i = 0; i < pairs; i++) {
        mmx_r[i] = _mm_add_pi32(mmx_a[i], mmx_b[i]);
    }

    /* Leave the MMX state so that later floating-point code is safe. */
    _mm_empty();

    /* The following code, which takes care of the excess element, is not
       needed if the array sizes passed are known to be multiples of two. */
    if (l % 2 != 0) {
        r[l - 1] = a[l - 1] + b[l - 1];
    }
}
/* The function stub informs the compiler to generate the
 * CPU-dispatch function listed in the cpu_dispatch clause.
 *
 * Its signature matches the cpu_specific versions of array_sum:
 *   r  output array
 *   a  first input array (read only)
 *   b  second input array
 *   l  number of elements
 */
__declspec(cpu_dispatch(pentium, pentium_MMX))
void array_sum(int *r, int const *a, int *b, size_t l)
{
    /* The function body is empty. */
}
|
|