From f67524707d0daecfe9e93d91d0aa51c5e7479e93 Mon Sep 17 00:00:00 2001
From: Andre Puschmann
Date: Wed, 6 May 2020 22:04:11 +0200
Subject: [PATCH] neon: add srslte_simd_b_neg()

clang complained about an uninitialized var that is returned, and it
turned out that we don't even implement that function in NEON. I've
found a nice MIT-licensed header that was easy to integrate.
---
 lib/include/srslte/phy/utils/simd.h | 44 +++++++++++++++++++++++++----
 lib/src/phy/utils/vector_simd.c     |  2 --
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 6379ac968..75ac2813c 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -1548,9 +1548,25 @@ static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b)
   return _mm_sign_epi16(a, b);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  simd_s_t res;
-  return res;
-  //#error sign instruction not available in Neon
+  /* Taken and modified from sse2neon.h licensed under MIT
+   * Source: https://github.com/DLTcollab/sse2neon
+   */
+  int16x8_t _a = vreinterpretq_s16_s32(a);
+  int16x8_t _b = vreinterpretq_s16_s32(b);
+
+  int16x8_t zero = vdupq_n_s16(0);
+  // signed shift right: faster than vclt
+  // (b < 0) ? 0xFFFF : 0
+  uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(_b, 15));
+  // (b == 0) ? 0xFFFF : 0
+  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(_b, zero));
+  // -a
+  int16x8_t neg = vnegq_s16(_a);
+  // bitwise select neg where ltMask is set (b < 0), else a
+  int16x8_t masked = vbslq_s16(ltMask, neg, _a);
+  // res = masked & (~zeroMask)
+  int16x8_t res = vbicq_s16(masked, zeroMask);
+  return vreinterpretq_s32_s16(res);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -2049,9 +2065,25 @@ static inline simd_b_t srslte_simd_b_neg(simd_b_t a, simd_b_t b)
   return _mm_sign_epi8(a, b);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  simd_s_t res;
-  return res;
-  //#error sign instruction not available in Neon
+  /* Taken and modified from sse2neon.h licensed under MIT
+   * Source: https://github.com/DLTcollab/sse2neon
+   */
+  int8x16_t _a = vreinterpretq_s8_s64(a);
+  int8x16_t _b = vreinterpretq_s8_s64(b);
+
+  int8x16_t zero = vdupq_n_s8(0);
+  // signed shift right: faster than vclt
+  // (b < 0) ? 0xFF : 0
+  uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(_b, 7));
+  // (b == 0) ? 0xFF : 0
+  int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(_b, zero));
+  // -a
+  int8x16_t neg = vnegq_s8(_a);
+  // bitwise select neg where ltMask is set (b < 0), else a
+  int8x16_t masked = vbslq_s8(ltMask, neg, _a);
+  // res = masked & (~zeroMask)
+  int8x16_t res = vbicq_s8(masked, zeroMask);
+  return vreinterpretq_s64_s8(res);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 9740f8bb2..3127c3a66 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -223,7 +223,6 @@ void srslte_vec_neg_sss_simd(const int16_t* x, const int16_t* y, int16_t* z, con
 {
   int i = 0;
 
-#ifndef HAVE_NEON
 #if SRSLTE_SIMD_S_SIZE
   if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
     for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
@@ -245,7 +244,6 @@ void srslte_vec_neg_sss_simd(const int16_t* x, const int16_t* y, int16_t* z, con
     }
   }
 #endif /* SRSLTE_SIMD_S_SIZE */
-#endif /* NOT HAVE_NEON*/
 
   for (; i < len; i++) {
     z[i] = y[i] < 0 ? -x[i] : x[i];
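
Reviewer note (not part of the patch): the two hunks above emulate the
semantics of SSE's _mm_sign_epi16()/_mm_sign_epi8(): each lane of a is
negated where the matching lane of b is negative, zeroed where b is zero,
and passed through otherwise. Below is a minimal standalone sanity check
of the 16-bit variant. It is a sketch, assuming an AArch64 or NEON-enabled
ARMv7 toolchain; the helper name neon_sign_s16() is hypothetical and only
mirrors the logic added to srslte_simd_s_neg(), it is not part of the
srsLTE API.

/* Hypothetical standalone check, not part of this patch: mirrors the NEON
 * sign emulation above and compares it against a scalar reference with
 * _mm_sign_epi16() semantics. Build on AArch64, e.g.:
 *   gcc -O2 -o sign_check sign_check.c && ./sign_check
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int16x8_t neon_sign_s16(int16x8_t a, int16x8_t b)
{
  // (b < 0) ? 0xFFFF : 0, via arithmetic shift of the sign bit
  uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
  // (b == 0) ? 0xFFFF : 0
  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
  // select -a where b < 0, else a
  int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
  // clear lanes where b == 0
  return vbicq_s16(masked, zeroMask);
}

int main(void)
{
  // last lane exercises INT16_MIN, which wraps to itself under negation
  int16_t a[8]   = {1, -2, 3, -4, 5, -6, 7, -32768};
  int16_t b[8]   = {-1, 0, 2, -3, 0, 4, -5, -6};
  int16_t out[8] = {0};

  vst1q_s16(out, neon_sign_s16(vld1q_s16(a), vld1q_s16(b)));

  for (int i = 0; i < 8; i++) {
    // scalar reference with the same semantics as _mm_sign_epi16()
    int16_t ref = (int16_t)((b[i] < 0) ? -a[i] : ((b[i] == 0) ? 0 : a[i]));
    printf("%d: got %6d expected %6d %s\n",
           i, out[i], ref, out[i] == ref ? "OK" : "FAIL");
  }
  return 0;
}

The INT16_MIN lane matches _mm_sign_epi16() exactly because vnegq_s16()
wraps rather than saturates. Note also that the plain C tail loop in
srslte_vec_neg_sss_simd() (z[i] = y[i] < 0 ? -x[i] : x[i]) does not zero
lanes where y[i] == 0, so the SIMD and scalar paths can differ for zero
entries of y.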