Fix value selector for NEON

This commit is contained in:
Xavier Arteaga 2021-07-30 15:47:51 +02:00 committed by Xavier Arteaga
parent 99dc94ab38
commit a300a47673
2 changed files with 13 additions and 37 deletions

View File

@@ -1168,7 +1168,7 @@ typedef __m128 simd_sel_t;
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef HAVE_NEON #ifdef HAVE_NEON
typedef int32x4_t simd_i_t; typedef int32x4_t simd_i_t;
typedef int32x4_t simd_sel_t; typedef uint32x4_t simd_sel_t;
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@@ -1300,7 +1300,7 @@ static inline simd_sel_t srsran_simd_f_max(simd_f_t a, simd_f_t b)
return (simd_sel_t)_mm_cmpgt_ps(a, b); return (simd_sel_t)_mm_cmpgt_ps(a, b);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
return (simd_sel_t)vcgtq_f32(a, b); return vcgtq_f32(a, b);
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@@ -1319,7 +1319,7 @@ static inline simd_sel_t srsran_simd_f_min(simd_f_t a, simd_f_t b)
return (simd_sel_t)_mm_cmplt_ps(a, b); return (simd_sel_t)_mm_cmplt_ps(a, b);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON #ifdef HAVE_NEON
return (simd_sel_t)vcltq_f32(a, b); return vcltq_f32(a, b);
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@@ -1337,20 +1337,8 @@ static inline simd_f_t srsran_simd_f_select(simd_f_t a, simd_f_t b, simd_sel_t s
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_blendv_ps(a, b, selector); return _mm_blendv_ps(a, b, selector);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON #ifdef HAVE_NEON
float* a_ptr = (float*)&a; return vbslq_f32(selector, b, a);
float* b_ptr = (float*)&b;
simd_f_t ret;
int* sel = (int*)&selector;
float* c_ptr = (float*)&ret;
for (int i = 0; i < 4; i++) {
if (sel[i] == -1) {
c_ptr[i] = b_ptr[i];
} else {
c_ptr[i] = a_ptr[i];
}
}
return ret;
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@@ -1368,20 +1356,8 @@ static inline simd_i_t srsran_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return (__m128i)_mm_blendv_ps((__m128)a, (__m128)b, selector); return (__m128i)_mm_blendv_ps((__m128)a, (__m128)b, selector);
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON #ifdef HAVE_NEON
int* a_ptr = (int*)&a; return vbslq_s32(selector, b, a);
int* b_ptr = (int*)&b;
simd_i_t ret;
int* sel = (int*)&selector;
int* c_ptr = (int*)&ret;
for (int i = 0; i < 4; i++) {
if (sel[i] == -1) {
c_ptr[i] = b_ptr[i];
} else {
c_ptr[i] = a_ptr[i];
}
}
return ret;
#endif /* HAVE_NEON */ #endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */

View File

@@ -1377,7 +1377,7 @@ uint32_t srsran_vec_max_fi_simd(const float* x, const int len)
simd_f_t a = srsran_simd_f_load(&x[i]); simd_f_t a = srsran_simd_f_load(&x[i]);
simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_sel_t res = srsran_simd_f_max(a, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); simd_max_values = srsran_simd_f_select(simd_max_values, a, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} else { } else {
@@ -1385,7 +1385,7 @@ uint32_t srsran_vec_max_fi_simd(const float* x, const int len)
simd_f_t a = srsran_simd_f_loadu(&x[i]); simd_f_t a = srsran_simd_f_loadu(&x[i]);
simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_sel_t res = srsran_simd_f_max(a, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); simd_max_values = srsran_simd_f_select(simd_max_values, a, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} }
@@ -1435,7 +1435,7 @@ uint32_t srsran_vec_max_abs_fi_simd(const float* x, const int len)
simd_f_t a = srsran_simd_f_abs(srsran_simd_f_load(&x[i])); simd_f_t a = srsran_simd_f_abs(srsran_simd_f_load(&x[i]));
simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_sel_t res = srsran_simd_f_max(a, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); simd_max_values = srsran_simd_f_select(simd_max_values, a, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} else { } else {
@@ -1443,7 +1443,7 @@ uint32_t srsran_vec_max_abs_fi_simd(const float* x, const int len)
simd_f_t a = srsran_simd_f_abs(srsran_simd_f_loadu(&x[i])); simd_f_t a = srsran_simd_f_abs(srsran_simd_f_loadu(&x[i]));
simd_sel_t res = srsran_simd_f_max(a, simd_max_values); simd_sel_t res = srsran_simd_f_max(a, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)a, res); simd_max_values = srsran_simd_f_select(simd_max_values, a, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} }
@@ -1502,7 +1502,7 @@ uint32_t srsran_vec_max_ci_simd(const cf_t* x, const int len)
simd_sel_t res = srsran_simd_f_max(z1, simd_max_values); simd_sel_t res = srsran_simd_f_max(z1, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)z1, res); simd_max_values = srsran_simd_f_select(simd_max_values, z1, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} else { } else {
@@ -1518,7 +1518,7 @@ uint32_t srsran_vec_max_ci_simd(const cf_t* x, const int len)
simd_sel_t res = srsran_simd_f_max(z1, simd_max_values); simd_sel_t res = srsran_simd_f_max(z1, simd_max_values);
simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_indexes = srsran_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t)srsran_simd_i_select((simd_i_t)simd_max_values, (simd_i_t)z1, res); simd_max_values = srsran_simd_f_select(simd_max_values, z1, res);
simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc); simd_indexes = srsran_simd_i_add(simd_indexes, simd_inc);
} }
} }