Fix tests to use the new SIMD vector functions

This commit is contained in:
Ismael Gomez 2017-01-25 17:30:16 +01:00
parent d65136eb94
commit f2a35c6dd1
4 changed files with 77 additions and 47 deletions

View File

@ -115,9 +115,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
if (HAVE_AVX2) if (HAVE_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx2 -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx2 -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
else (HAVE_AVX2) else (HAVE_AVX2)
message("NOT HAVE AVX2")
if(HAVE_AVX) if(HAVE_AVX)
message("HAVE AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
elseif(HAVE_SSE) elseif(HAVE_SSE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -msse4.1 -Ofast -funroll-loops -DLV_HAVE_SSE") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -msse4.1 -Ofast -funroll-loops -DLV_HAVE_SSE")

View File

@ -51,6 +51,8 @@ SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, ui
SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len);
SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len); SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len);
SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len); SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len);

View File

@ -201,19 +201,18 @@ void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
// Used throughout // Used throughout
void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) { void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
#ifndef LV_HAVE_SSE #ifndef LV_HAVE_SSE
int i; int i;
for (i=0;i<len;i++) { for (i=0;i<len;i++) {
z[i] = x[i]*h; z[i] = x[i]*h;
} }
#else #else
cf_t hh; srslte_vec_sc_prod_cfc_simd(x, h, z, len);
__real__ hh = h;
__imag__ hh = 0;
srslte_vec_sc_prod_ccc_simd(x,hh,z,len);
#endif #endif
} }
// Chest UL // Chest UL
void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) { void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
#ifndef LV_HAVE_SSE #ifndef LV_HAVE_SSE

View File

@ -57,8 +57,8 @@ int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
__m128i xVal, yVal, zVal; __m128i xVal, yVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
yVal = _mm_load_si128(yPtr); yVal = _mm_loadu_si128(yPtr);
zVal = _mm_mullo_epi16(xVal, yVal); zVal = _mm_mullo_epi16(xVal, yVal);
@ -69,7 +69,7 @@ int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
} }
short dotProdVector[8]; short dotProdVector[8];
_mm_store_si128((__m128i*) dotProdVector, dotProdVal); _mm_storeu_si128((__m128i*) dotProdVector, dotProdVal);
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
result += dotProdVector[i]; result += dotProdVector[i];
} }
@ -96,12 +96,12 @@ void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
__m128i xVal, yVal, zVal; __m128i xVal, yVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
yVal = _mm_load_si128(yPtr); yVal = _mm_loadu_si128(yPtr);
zVal = _mm_add_epi16(xVal, yVal); zVal = _mm_add_epi16(xVal, yVal);
_mm_store_si128(zPtr, zVal); _mm_storeu_si128(zPtr, zVal);
xPtr ++; xPtr ++;
yPtr ++; yPtr ++;
@ -129,12 +129,12 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
__m128i xVal, yVal, zVal; __m128i xVal, yVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
yVal = _mm_load_si128(yPtr); yVal = _mm_loadu_si128(yPtr);
zVal = _mm_sub_epi16(xVal, yVal); zVal = _mm_sub_epi16(xVal, yVal);
_mm_store_si128(zPtr, zVal); _mm_storeu_si128(zPtr, zVal);
xPtr ++; xPtr ++;
yPtr ++; yPtr ++;
@ -161,12 +161,12 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
__m128i xVal, yVal, zVal; __m128i xVal, yVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
yVal = _mm_load_si128(yPtr); yVal = _mm_loadu_si128(yPtr);
zVal = _mm_mullo_epi16(xVal, yVal); zVal = _mm_mullo_epi16(xVal, yVal);
_mm_store_si128(zPtr, zVal); _mm_storeu_si128(zPtr, zVal);
xPtr ++; xPtr ++;
yPtr ++; yPtr ++;
@ -192,11 +192,11 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
__m128i xVal, zVal; __m128i xVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
zVal = _mm_srai_epi16(xVal, k); zVal = _mm_srai_epi16(xVal, k);
_mm_store_si128(zPtr, zVal); _mm_storeu_si128(zPtr, zVal);
xPtr ++; xPtr ++;
zPtr ++; zPtr ++;
@ -223,8 +223,8 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
__m128i xVal, lutVal; __m128i xVal, lutVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
lutVal = _mm_load_si128(lutPtr); lutVal = _mm_loadu_si128(lutPtr);
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, i); int16_t x = (int16_t) _mm_extract_epi16(xVal, i);
@ -295,12 +295,12 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
__m128 xVal, yVal, zVal; __m128 xVal, yVal, zVal;
for(;number < points; number++){ for(;number < points; number++){
xVal = _mm_load_ps(xPtr); xVal = _mm_loadu_ps(xPtr);
yVal = _mm_load_ps(yPtr); yVal = _mm_loadu_ps(yPtr);
zVal = _mm_add_ps(xVal, yVal); zVal = _mm_add_ps(xVal, yVal);
_mm_store_ps(zPtr, zVal); _mm_storeu_ps(zPtr, zVal);
xPtr += 4; xPtr += 4;
yPtr += 4; yPtr += 4;
@ -338,18 +338,19 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
const float* yPtr = (const float*) y; const float* yPtr = (const float*) y;
for(; number < halfPoints; number++){ for(; number < halfPoints; number++){
xVal = _mm_load_ps(xPtr); xVal = _mm_loadu_ps(xPtr);
yVal = _mm_load_ps(yPtr); yVal = _mm_loadu_ps(yPtr);
zVal = _mm_complexmul_ps(xVal, yVal); zVal = _mm_complexmul_ps(xVal, yVal);
_mm_store_ps(zPtr, zVal); _mm_storeu_ps(zPtr, zVal);
xPtr += 4; xPtr += 4;
yPtr += 4; yPtr += 4;
zPtr += 4; zPtr += 4;
} }
if((len % 2) != 0){ number = halfPoints * 2;
*zPtr = (*xPtr) * (*yPtr); for(;number < len; number++){
z[number] = x[number] * y[number];
} }
#endif #endif
} }
@ -373,18 +374,19 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
const float* yPtr = (const float*) y; const float* yPtr = (const float*) y;
for(; number < halfPoints; number++){ for(; number < halfPoints; number++){
xVal = _mm_load_ps(xPtr); xVal = _mm_loadu_ps(xPtr);
yVal = _mm_load_ps(yPtr); yVal = _mm_loadu_ps(yPtr);
zVal = _mm_complexmulconj_ps(xVal, yVal); zVal = _mm_complexmulconj_ps(xVal, yVal);
_mm_store_ps(zPtr, zVal); _mm_storeu_ps(zPtr, zVal);
xPtr += 4; xPtr += 4;
yPtr += 4; yPtr += 4;
zPtr += 4; zPtr += 4;
} }
if((len % 2) != 0){ number = halfPoints * 2;
*zPtr = (*xPtr) * (*yPtr); for(;number < len; number++){
z[number] = x[number] * conjf(y[number]);
} }
#endif #endif
} }
@ -404,7 +406,7 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
for(;number < halfPoints; number++){ for(;number < halfPoints; number++){
xVal = _mm_load_ps(xPtr); xVal = _mm_loadu_ps(xPtr);
tmp1 = _mm_mul_ps(xVal,yl); tmp1 = _mm_mul_ps(xVal,yl);
xVal = _mm_shuffle_ps(xVal,xVal,0xB1); xVal = _mm_shuffle_ps(xVal,xVal,0xB1);
tmp2 = _mm_mul_ps(xVal,yh); tmp2 = _mm_mul_ps(xVal,yh);
@ -415,12 +417,43 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
zPtr += 4; zPtr += 4;
} }
if((len % 2) != 0) { number = halfPoints * 2;
*zPtr = (*xPtr) * h; for(;number < len; number++){
z[number] = x[number] * h;
} }
#endif #endif
} }
// Scales a complex vector by a real scalar: z[i] = x[i] * h, for i in [0, len).
//
// Parameters:
//   x   - input vector of complex floats (read-only)
//   h   - real scaling factor
//   z   - output vector (may alias x; element-wise write only)
//   len - number of complex elements
//
// The SSE path processes two complex elements (4 floats) per iteration using
// unaligned loads/stores, then finishes any odd element with a scalar tail.
void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len) {
#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int halfPoints = len / 2;

  __m128 xVal, hVal, zVal;
  float* zPtr = (float*) z;
  const float* xPtr = (const float*) x;

  // Broadcast h into all four lanes: multiplying both the real and the
  // imaginary part of each complex element by h is exactly x[i] * h.
  hVal = _mm_set_ps1(h);

  for (; number < halfPoints; number++) {
    xVal = _mm_loadu_ps(xPtr);
    zVal = _mm_mul_ps(xVal, hVal);
    _mm_storeu_ps(zPtr, zVal);

    xPtr += 4;
    zPtr += 4;
  }

  // Scalar tail for odd-length inputs (the vector loop covers halfPoints*2).
  number = halfPoints * 2;
  for (; number < len; number++) {
    z[number] = x[number] * h;
  }
#else
  // Without SSE the block above is compiled out and z would silently be left
  // unwritten; fall back to a plain scalar loop so the function always
  // produces its result regardless of build flags.
  for (uint32_t i = 0; i < len; i++) {
    z[i] = x[i] * h;
  }
#endif
}
void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) { void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
unsigned int number = 0; unsigned int number = 0;
@ -431,22 +464,20 @@ void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
__m128 xVal1, xVal2, zVal; __m128 xVal1, xVal2, zVal;
for(; number < quarterPoints; number++){ for(; number < quarterPoints; number++){
xVal1 = _mm_load_ps(xPtr); xVal1 = _mm_loadu_ps(xPtr);
xPtr += 4; xPtr += 4;
xVal2 = _mm_load_ps(xPtr); xVal2 = _mm_loadu_ps(xPtr);
xPtr += 4; xPtr += 4;
xVal1 = _mm_mul_ps(xVal1, xVal1); xVal1 = _mm_mul_ps(xVal1, xVal1);
xVal2 = _mm_mul_ps(xVal2, xVal2); xVal2 = _mm_mul_ps(xVal2, xVal2);
zVal = _mm_hadd_ps(xVal1, xVal2); zVal = _mm_hadd_ps(xVal1, xVal2);
_mm_store_ps(zPtr, zVal); _mm_storeu_ps(zPtr, zVal);
zPtr += 4; zPtr += 4;
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(; number < len; number++){ for(;number < len; number++){
float val1Real = *xPtr++; z[number] = creal(x[number]) * creal(x[number]) + cimag(x[number])*cimag(x[number]);
float val1Imag = *xPtr++;
*zPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
} }
#endif #endif
} }