From f67524707d0daecfe9e93d91d0aa51c5e7479e93 Mon Sep 17 00:00:00 2001
From: Andre Puschmann
Date: Wed, 6 May 2020 22:04:11 +0200
Subject: [PATCH] neon: add srslte_simd_b_neg()

clang complained about an uninitialized var that is returned, and it
turned out that we don't even implement that function in NEON. I've
found a nice MIT-licensed header that was easy to integrate.
---
 lib/include/srslte/phy/utils/simd.h | 44 +++++++++++++++++++++++++----
 lib/src/phy/utils/vector_simd.c     |  2 --
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 6379ac968..75ac2813c 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -1548,9 +1548,25 @@ static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b)
   return _mm_sign_epi16(a, b);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  simd_s_t res;
-  return res;
-  //#error sign instruction not available in Neon
+  /* Taken and modified from sse2neon.h licensed under MIT
+   * Source: https://github.com/DLTcollab/sse2neon
+   */
+  int16x8_t _a = vreinterpretq_s16_s32(a);
+  int16x8_t _b = vreinterpretq_s16_s32(b);
+
+  int16x8_t zero = vdupq_n_s16(0);
+  // signed shift right: faster than vclt
+  // (b < 0) ? 0xFFFF : 0
+  uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(_b, 15));
+  // (b == 0) ? 0xFFFF : 0
+  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(_b, zero));
+  // -a
+  int16x8_t neg = vnegq_s16(_a);
+  // bitwise select neg where ltMask is set (b < 0), else a
+  int16x8_t masked = vbslq_s16(ltMask, neg, _a);
+  // res = masked & (~zeroMask)
+  int16x8_t res = vbicq_s16(masked, zeroMask);
+  return vreinterpretq_s32_s16(res);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -2049,9 +2065,25 @@ static inline simd_b_t srslte_simd_b_neg(simd_b_t a, simd_b_t b)
   return _mm_sign_epi8(a, b);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  simd_s_t res;
-  return res;
-  //#error sign instruction not available in Neon
+  /* Taken and modified from sse2neon.h licensed under MIT
+   * Source: https://github.com/DLTcollab/sse2neon
+   */
+  int8x16_t _a = vreinterpretq_s8_s64(a);
+  int8x16_t _b = vreinterpretq_s8_s64(b);
+
+  int8x16_t zero = vdupq_n_s8(0);
+  // signed shift right: faster than vclt
+  // (b < 0) ? 0xFF : 0
+  uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(_b, 7));
+  // (b == 0) ? 0xFF : 0
+  int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(_b, zero));
+  // -a
+  int8x16_t neg = vnegq_s8(_a);
+  // bitwise select neg where ltMask is set (b < 0), else a
+  int8x16_t masked = vbslq_s8(ltMask, neg, _a);
+  // res = masked & (~zeroMask)
+  int8x16_t res = vbicq_s8(masked, zeroMask);
+  return vreinterpretq_s64_s8(res);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 9740f8bb2..3127c3a66 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -223,7 +223,6 @@ void srslte_vec_neg_sss_simd(const int16_t* x, const int16_t* y, int16_t* z, con
 {
   int i = 0;
 
-#ifndef HAVE_NEON
 #if SRSLTE_SIMD_S_SIZE
   if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
     for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
@@ -245,7 +244,6 @@ void srslte_vec_neg_sss_simd(const int16_t* x, const int16_t* y, int16_t* z, con
     }
   }
 #endif /* SRSLTE_SIMD_S_SIZE */
-#endif /* NOT HAVE_NEON*/
 
   for (; i < len; i++) {
     z[i] = y[i] < 0 ? -x[i] : x[i];
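
Reviewer note (not part of the patch): the two hunks above emulate the
semantics of SSE's _mm_sign_epi16()/_mm_sign_epi8(): each lane of a is
negated where the matching lane of b is negative, zeroed where b is zero,
and passed through otherwise. Below is a minimal standalone sanity check
of the 16-bit variant. It is a sketch, assuming an AArch64 or NEON-enabled
ARMv7 toolchain; the helper name neon_sign_s16() is hypothetical and only
mirrors the logic added to srslte_simd_s_neg(), it is not part of the
srsLTE API.

/* Hypothetical standalone check, not part of this patch: mirrors the NEON
 * sign emulation above and compares it against a scalar reference with
 * _mm_sign_epi16() semantics. Build on AArch64, e.g.:
 *   gcc -O2 -o sign_check sign_check.c && ./sign_check
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int16x8_t neon_sign_s16(int16x8_t a, int16x8_t b)
{
  // (b < 0) ? 0xFFFF : 0, via arithmetic shift of the sign bit
  uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
  // (b == 0) ? 0xFFFF : 0
  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
  // select -a where b < 0, else a
  int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
  // clear lanes where b == 0
  return vbicq_s16(masked, zeroMask);
}

int main(void)
{
  // last lane exercises INT16_MIN, which wraps to itself under negation
  int16_t a[8]   = {1, -2, 3, -4, 5, -6, 7, -32768};
  int16_t b[8]   = {-1, 0, 2, -3, 0, 4, -5, -6};
  int16_t out[8] = {0};

  vst1q_s16(out, neon_sign_s16(vld1q_s16(a), vld1q_s16(b)));

  for (int i = 0; i < 8; i++) {
    // scalar reference with the same semantics as _mm_sign_epi16()
    int16_t ref = (int16_t)((b[i] < 0) ? -a[i] : ((b[i] == 0) ? 0 : a[i]));
    printf("%d: got %6d expected %6d %s\n",
           i, out[i], ref, out[i] == ref ? "OK" : "FAIL");
  }
  return 0;
}

The INT16_MIN lane matches _mm_sign_epi16() exactly because vnegq_s16()
wraps rather than saturates. Note also that the plain C tail loop in
srslte_vec_neg_sss_simd() (z[i] = y[i] < 0 ? -x[i] : x[i]) does not zero
lanes where y[i] == 0, so the SIMD and scalar paths can differ for zero
entries of y.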