Added Vector max abs SIMD function

This commit is contained in:
Xavier Arteaga 2018-06-27 17:01:37 +02:00
parent e18ba937dc
commit f01f7b4945
5 changed files with 122 additions and 19 deletions

View File

@ -203,7 +203,7 @@ static inline simd_f_t srslte_simd_f_loadu(const float *ptr) {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_loadu_ps(ptr); return _mm_loadu_ps(ptr);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
return vld1q_f32(ptr); return vld1q_f32(ptr);
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -396,7 +396,7 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_permute_ps(a, 0b10110001); return _mm256_permute_ps(a, 0b10110001);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_shuffle_ps(a, a, 0b10110001); return _mm_shuffle_ps(a, a, 0b10110001);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -428,7 +428,7 @@ static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) {
simd_f_t b1 = _mm256_permute2f128_ps(a, b, 0b00110001); simd_f_t b1 = _mm256_permute2f128_ps(a, b, 0b00110001);
return _mm256_hadd_ps(a1, b1); return _mm256_hadd_ps(a1, b1);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_hadd_ps(a, b); return _mm_hadd_ps(a, b);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -446,7 +446,7 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_sqrt_ps(a); return _mm256_sqrt_ps(a);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_sqrt_ps(a); return _mm_sqrt_ps(a);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -471,7 +471,7 @@ static inline simd_f_t srslte_simd_f_neg(simd_f_t a) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a); return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_xor_ps(_mm_set1_ps(-0.0f), a); return _mm_xor_ps(_mm_set1_ps(-0.0f), a);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -489,7 +489,7 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_xor_ps(mask, a); return _mm256_xor_ps(mask, a);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_xor_ps(mask, a); return _mm_xor_ps(mask, a);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -500,6 +500,25 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) {
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
} }
static inline simd_f_t srslte_simd_f_abs(simd_f_t a) {
#ifdef LV_HAVE_AVX512
return _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_andnot_ps(_mm_set1_ps(-0.0f), a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
return vqabsq_s32(a);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
#endif /* SRSLTE_SIMD_F_SIZE */ #endif /* SRSLTE_SIMD_F_SIZE */
@ -836,7 +855,7 @@ static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) {
_mm_mul_ps(a.im, b.im)); _mm_mul_ps(a.im, b.im));
ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re), ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re),
_mm_mul_ps(a.re, b.im)); _mm_mul_ps(a.re, b.im));
#else #else
#ifdef HAVE_NEON #ifdef HAVE_NEON
ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]), ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]),
vmulq_f32(a.val[1],b.val[1])); vmulq_f32(a.val[1],b.val[1]));
@ -883,7 +902,7 @@ static inline simd_cf_t srslte_simd_cf_sub (simd_cf_t a, simd_cf_t b) {
ret.re = _mm256_sub_ps(a.re, b.re); ret.re = _mm256_sub_ps(a.re, b.re);
ret.im = _mm256_sub_ps(a.im, b.im); ret.im = _mm256_sub_ps(a.im, b.im);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
ret.re = _mm_sub_ps(a.re, b.re); ret.re = _mm_sub_ps(a.re, b.re);
ret.im = _mm_sub_ps(a.im, b.im); ret.im = _mm_sub_ps(a.im, b.im);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
@ -942,7 +961,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
ret.re = _mm256_mul_ps(a.re, rcp); ret.re = _mm256_mul_ps(a.re, rcp);
ret.im = _mm256_mul_ps(neg_a_im, rcp); ret.im = _mm256_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
simd_f_t a2re = _mm_mul_ps(a.re, a.re); simd_f_t a2re = _mm_mul_ps(a.re, a.re);
simd_f_t a2im = _mm_mul_ps(a.im, a.im); simd_f_t a2im = _mm_mul_ps(a.im, a.im);
simd_f_t mod2 = _mm_add_ps(a2re, a2im); simd_f_t mod2 = _mm_add_ps(a2re, a2im);
@ -951,7 +970,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
ret.re = _mm_mul_ps(a.re, rcp); ret.re = _mm_mul_ps(a.re, rcp);
ret.im = _mm_mul_ps(neg_a_im, rcp); ret.im = _mm_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]); simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]);
simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]); simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]);
simd_f_t mod2 = vaddq_f32(a2re, a2im); simd_f_t mod2 = vaddq_f32(a2re, a2im);
@ -1074,10 +1093,10 @@ static inline simd_i_t srslte_simd_i_load(int *x) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_load_si256((__m256i*)x); return _mm256_load_si256((__m256i*)x);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_load_si128((__m128i*)x); return _mm_load_si128((__m128i*)x);
#else #else
#ifdef HAVE_NEON #ifdef HAVE_NEON
return vld1q_s32((int*)x); return vld1q_s32((int*)x);
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -1110,10 +1129,10 @@ static inline simd_i_t srslte_simd_i_set1(int x) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_set1_epi32(x); return _mm256_set1_epi32(x);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_set1_epi32(x); return _mm_set1_epi32(x);
#else #else
#ifdef HAVE_NEON #ifdef HAVE_NEON
return vdupq_n_s32(x); return vdupq_n_s32(x);
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -1146,7 +1165,7 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_cmp_ps(a, b, _CMP_GT_OS); return _mm256_cmp_ps(a, b, _CMP_GT_OS);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return (simd_sel_t) _mm_cmpgt_ps(a, b); return (simd_sel_t) _mm_cmpgt_ps(a, b);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
@ -1164,7 +1183,7 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b, selector); return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b, selector);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector); return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON

View File

@ -136,6 +136,8 @@ SRSLTE_API void srslte_vec_apply_cfo_simd(const cf_t *x, float cfo, cf_t *z, int
/* SIMD Find Max functions */ /* SIMD Find Max functions */
SRSLTE_API uint32_t srslte_vec_max_fi_simd(const float *x, const int len); SRSLTE_API uint32_t srslte_vec_max_fi_simd(const float *x, const int len);
SRSLTE_API uint32_t srslte_vec_max_abs_fi_simd(const float *x, const int len);
SRSLTE_API uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len); SRSLTE_API uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -48,8 +48,7 @@ bool verbose = false;
#define MAX_FUNCTIONS (64) #define MAX_FUNCTIONS (64)
#define MAX_BLOCKS (16) #define MAX_BLOCKS (16)
#define RANDOM_F() (((float) rand()) / ((float) RAND_MAX) * 2.0f - 1.0f)
#define RANDOM_F() ((float)rand())/((float)RAND_MAX)
#define RANDOM_S() ((int16_t)(rand() & 0x800F)) #define RANDOM_S() ((int16_t)(rand() & 0x800F))
#define RANDOM_B() ((int8_t)(rand() & 0x8008)) #define RANDOM_B() ((int8_t)(rand() & 0x8008))
#define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F()) #define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F())
@ -705,6 +704,29 @@ TEST(srslte_vec_max_fi,
free(x); free(x);
) )
TEST(srslte_vec_max_abs_fi,
MALLOC(float, x);
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_F();
}
uint32_t max_index = 0;
TEST_CALL(max_index = srslte_vec_max_abs_fi(x, block_size);)
float gold_value = -INFINITY;
uint32_t gold_index = 0;
for (int i = 0; i < block_size; i++) {
if (gold_value < fabsf(x[i])) {
gold_value = fabsf(x[i]);
gold_index = i;
}
}
mse = (gold_index != max_index) ? 1:0;
free(x);
)
TEST(srslte_vec_max_abs_ci, TEST(srslte_vec_max_abs_ci,
MALLOC(cf_t, x); MALLOC(cf_t, x);
@ -899,6 +921,9 @@ int main(int argc, char **argv) {
passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size); passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++; func_count++;
passed[func_count][size_count] = test_srslte_vec_max_abs_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size); passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++; func_count++;

View File

@ -369,7 +369,7 @@ uint32_t srslte_vec_max_fi(const float *x, const uint32_t len) {
} }
uint32_t srslte_vec_max_abs_fi(const float *x, const uint32_t len) { uint32_t srslte_vec_max_abs_fi(const float *x, const uint32_t len) {
return srslte_vec_max_fi_simd(x, len); return srslte_vec_max_abs_fi_simd(x, len);
} }
// CP autocorr // CP autocorr

View File

@ -1092,6 +1092,63 @@ uint32_t srslte_vec_max_fi_simd(const float *x, const int len) {
return max_index; return max_index;
} }
uint32_t srslte_vec_max_abs_fi_simd(const float *x, const int len) {
int i = 0;
float max_value = -INFINITY;
uint32_t max_index = 0;
#if SRSLTE_SIMD_I_SIZE
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
if (SRSLTE_IS_ALIGNED(x)) {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t a = srslte_simd_f_abs(srslte_simd_f_load(&x[i]));
simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
} else {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t a = srslte_simd_f_abs(srslte_simd_f_loadu(&x[i]));
simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
}
srslte_simd_i_store(indexes_buffer, simd_max_indexes);
srslte_simd_f_store(values_buffer, simd_max_values);
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
if (values_buffer[k] > max_value) {
max_value = values_buffer[k];
max_index = (uint32_t) indexes_buffer[k];
}
}
#endif /* SRSLTE_SIMD_I_SIZE */
for (; i < len; i++) {
float a = fabsf(x[i]);
if (a > max_value) {
max_value = a;
max_index = (uint32_t)i;
}
}
return max_index;
}
uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len) { uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len) {
int i = 0; int i = 0;