Fixed a bug in sub_sse() and added a couple of AVX functions

This commit is contained in:
Ismael Gomez 2017-07-07 18:44:17 +02:00
parent 0dae4a00c4
commit f629e10fcf
3 changed files with 76 additions and 5 deletions

View File

@ -49,8 +49,12 @@ SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t l
/* Float vector sum/sub kernels, SSE and AVX variants: x and y are the operands,
 * z receives the result and len is the number of elements. */
SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len);
/* Scalar-product kernels taking a single scalar h instead of a second vector.
 * NOTE(review): presumably z[i] = x[i] * h -- confirm against the definitions. */
SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len);

View File

@ -101,9 +101,13 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
for (i=0;i<len;i++) {
z[i] = x[i]-y[i];
}
#else
#ifdef LV_HAVE_AVX
srslte_vec_sub_fff_avx(x, y, z, len);
#else
srslte_vec_sub_fff_sse(x, y, z, len);
#endif
#endif
}
void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
@ -134,7 +138,11 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
z[i] = x[i]+y[i];
}
#else
srslte_vec_sum_fff_sse(x, y, z, len);
#ifdef LV_HAVE_AVX
srslte_vec_sum_fff_avx(x, y, z, len);
#else
srslte_vec_sum_fff_sse(x, y, z, len);
#endif
#endif
}
@ -246,7 +254,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
for (i=0;i<len;i++) {
z[i] = x[i]*h;
}
#endif
#endif
}

View File

@ -501,6 +501,36 @@ void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
#endif
}
/**
 * Element-wise sum of two float vectors: z[i] = x[i] + y[i] for i in [0, len).
 *
 * When LV_HAVE_AVX is defined, elements are processed eight at a time using
 * unaligned 256-bit loads and stores. The scalar loop then handles whatever
 * is left: the trailing len % 8 elements in an AVX build, or every element
 * when AVX support is not compiled in, so z is always fully written.
 *
 * @param x   first operand, at least len floats
 * @param y   second operand, at least len floats
 * @param z   output, at least len floats
 * @param len number of elements to process (0 is a no-op)
 */
void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
  uint32_t i = 0;

#ifdef LV_HAVE_AVX
  /* Largest multiple of 8 not exceeding len: the part covered by full vectors. */
  const uint32_t simd_len = len - (len % 8);
  __m256 xVal, yVal;

  for (; i < simd_len; i += 8) {
    xVal = _mm256_loadu_ps(&x[i]);
    yVal = _mm256_loadu_ps(&y[i]);
    _mm256_storeu_ps(&z[i], _mm256_add_ps(xVal, yVal));
  }
#endif

  /* Scalar tail; also the complete fallback when the AVX path is compiled out. */
  for (; i < len; i++) {
    z[i] = x[i] + y[i];
  }
}
void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
#ifdef LV_HAVE_SSE
unsigned int number = 0;
@ -525,14 +555,43 @@ void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
zPtr += 4;
}
number = points * 4;
for(;number < len; number++){
z[number] = x[number] + y[number];
for(number = points * 4;number < len; number++){
z[number] = x[number] - y[number];
}
#endif
}
/**
 * Element-wise difference of two float vectors: z[i] = x[i] - y[i] for i in [0, len).
 *
 * The vector path uses 256-bit AVX intrinsics and is therefore guarded by
 * LV_HAVE_AVX (not LV_HAVE_SSE): an SSE-only build must not try to compile it.
 * It processes eight elements per iteration with unaligned loads and stores.
 * The scalar loop then handles whatever is left: the trailing len % 8 elements
 * in an AVX build, or every element when AVX support is not compiled in, so z
 * is always fully written.
 *
 * @param x   minuend, at least len floats
 * @param y   subtrahend, at least len floats
 * @param z   output, at least len floats
 * @param len number of elements to process (0 is a no-op)
 */
void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len) {
  uint32_t i = 0;

#ifdef LV_HAVE_AVX
  /* Largest multiple of 8 not exceeding len: the part covered by full vectors. */
  const uint32_t simd_len = len - (len % 8);
  __m256 xVal, yVal;

  for (; i < simd_len; i += 8) {
    xVal = _mm256_loadu_ps(&x[i]);
    yVal = _mm256_loadu_ps(&y[i]);
    _mm256_storeu_ps(&z[i], _mm256_sub_ps(xVal, yVal));
  }
#endif

  /* Scalar tail; also the complete fallback when the AVX path is compiled out. */
  for (; i < len; i++) {
    z[i] = x[i] - y[i];
  }
}
#ifdef LV_HAVE_SSE
static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
__m128 yl, yh, tmp1, tmp2;