From a300a47673d7d74dfda07f32331c32fc0d9ae365 Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Fri, 30 Jul 2021 15:47:51 +0200 Subject: [PATCH] Fix value selector for NEON --- lib/include/srsran/phy/utils/simd.h | 38 ++++++----------------------- lib/src/phy/utils/vector_simd.c | 12 ++++----- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/lib/include/srsran/phy/utils/simd.h b/lib/include/srsran/phy/utils/simd.h index cddb5a1f6..289c4b675 100644 --- a/lib/include/srsran/phy/utils/simd.h +++ b/lib/include/srsran/phy/utils/simd.h @@ -1168,7 +1168,7 @@ typedef __m128 simd_sel_t; #else /* LV_HAVE_AVX2 */ #ifdef HAVE_NEON typedef int32x4_t simd_i_t; -typedef int32x4_t simd_sel_t; +typedef uint32x4_t simd_sel_t; #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1300,7 +1300,7 @@ static inline simd_sel_t srsran_simd_f_max(simd_f_t a, simd_f_t b) return (simd_sel_t)_mm_cmpgt_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return (simd_sel_t)vcgtq_f32(a, b); + return vcgtq_f32(a, b); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1319,7 +1319,7 @@ static inline simd_sel_t srsran_simd_f_min(simd_f_t a, simd_f_t b) return (simd_sel_t)_mm_cmplt_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return (simd_sel_t)vcltq_f32(a, b); + return vcltq_f32(a, b); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1337,20 +1337,8 @@ static inline simd_f_t srsran_simd_f_select(simd_f_t a, simd_f_t b, simd_sel_t s #ifdef LV_HAVE_SSE return _mm_blendv_ps(a, b, selector); #else /* LV_HAVE_SSE */ -#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON - float* a_ptr = (float*)&a; - float* b_ptr = (float*)&b; - simd_f_t ret; - int* sel = (int*)&selector; - float* c_ptr = (float*)&ret; - for (int i = 0; i < 4; i++) { - if (sel[i] == -1) { - c_ptr[i] = b_ptr[i]; - } else { - c_ptr[i] = a_ptr[i]; - } - } - return ret; +#ifdef HAVE_NEON + return vbslq_f32(selector, b, a); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1368,20 +1356,8 @@ static inline simd_i_t srsran_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s #ifdef LV_HAVE_SSE return (__m128i)_mm_blendv_ps((__m128)a, (__m128)b, selector); #else /* LV_HAVE_SSE */ -#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON - int* a_ptr = (int*)&a; - int* b_ptr = (int*)&b; - simd_i_t ret; - int* sel = (int*)&selector; - int* c_ptr = (int*)&ret; - for (int i = 0; i < 4; i++) { - if (sel[i] == -1) { - c_ptr[i] = b_ptr[i]; - } else { - c_ptr[i] = a_ptr[i]; - } - } - return ret; +#ifdef HAVE_NEON + return vbslq_s32(selector, b, a); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c index c14a3a4f2..49e61c19f 100644 --- a/lib/src/phy/utils/vector_simd.c +++ b/lib/src/phy/utils/vector_simd.c @@ -1377,7 +1377,7 @@ uint32_t srsran_vec_max_fi_simd(const float* x, const int len) simd_f_t a = srsran_simd_f_load(&x[i]); simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); + simd_max_values = srsran_simd_f_select(simd_max_values, a, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } } else { @@ -1385,7 +1385,7 @@ uint32_t srsran_vec_max_fi_simd(const float* x, const int len) simd_f_t a = srsran_simd_f_loadu(&x[i]); simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); + simd_max_values = srsran_simd_f_select(simd_max_values, a, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } } @@ -1435,7 +1435,7 @@ uint32_t srsran_vec_max_abs_fi_simd(const float* x, const int len) simd_f_t a = srsran_simd_f_abs(srsran_simd_f_load(&x[i])); simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); + simd_max_values = srsran_simd_f_select(simd_max_values, a, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } } else { @@ -1443,7 +1443,7 @@ uint32_t srsran_vec_max_abs_fi_simd(const float* x, const int len) simd_f_t a = srsran_simd_f_abs(srsran_simd_f_loadu(&x[i])); simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); + simd_max_values = srsran_simd_f_select(simd_max_values, a, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } } @@ -1502,7 +1502,7 @@ uint32_t srsran_vec_max_ci_simd(const cf_t* x, const int len) simd_sel_t res = srsran_simd_f_max(z1, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)z1, res); + simd_max_values = srsran_simd_f_select(simd_max_values, z1, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } } else { @@ -1518,7 +1518,7 @@ uint32_t srsran_vec_max_ci_simd(const cf_t* x, const int len) simd_sel_t res = srsran_simd_f_max(z1, simd_max_values); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); - simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)z1, res); + simd_max_values = srsran_simd_f_select(simd_max_values, z1, res); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); } }