From 2019ca31eff3b7190cb6b2f3cf53734f95be9486 Mon Sep 17 00:00:00 2001 From: yagoda Date: Fri, 13 Oct 2017 15:35:48 +0100 Subject: [PATCH 1/4] adding neon support for new kernel structure --- lib/include/srslte/phy/utils/simd.h | 319 +++++++++++++++++++++++++--- 1 file changed, 292 insertions(+), 27 deletions(-) diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index 08eed115f..6e4185788 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -121,7 +121,17 @@ #define SRSLTE_SIMD_C16_SIZE 8 #else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON +#define SRSLTE_SIMD_F_SIZE 4 +#define SRSLTE_SIMD_CF_SIZE 4 + +#define SRSLTE_SIMD_I_SIZE 4 + +#define SRSLTE_SIMD_S_SIZE 8 +#define SRSLTE_SIMD_C16_SIZE 8 + +#else /* LV_HAVE_NEON */ #define SRSLTE_SIMD_F_SIZE 0 #define SRSLTE_SIMD_CF_SIZE 0 @@ -147,6 +157,10 @@ typedef __m256 simd_f_t; #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE typedef __m128 simd_f_t; +#else /* HAVE_NEON */ +#ifdef HAVE_NEON +typedef float32x4 simd_f_t; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -161,6 +175,10 @@ static inline simd_f_t srslte_simd_f_load(float *ptr) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_load_ps(ptr); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vld1q_f32(ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -175,12 +193,16 @@ static inline simd_f_t srslte_simd_f_loadu(float *ptr) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_loadu_ps(ptr); +#else /* LV_HAVE_SSE */ + #ifdef HAVE_NEON + return vld1q_f32(ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ } -static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) { +static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {vst1q_f32 #ifdef LV_HAVE_AVX512 _mm512_store_ps(ptr, simdreg); #else /* LV_HAVE_AVX512 */ @@ -189,6 +211,10 @@ static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE _mm_store_ps(ptr, simdreg); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + vst1q_f32(ptr, simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -203,6 +229,10 @@ static inline void srslte_simd_f_storeu(float *ptr, simd_f_t simdreg) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE _mm_storeu_ps(ptr, simdreg); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + vst1q_f32(ptr, simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -217,6 +247,10 @@ static inline simd_f_t srslte_simd_f_set1(float x) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_set1_ps(x); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vdupq_n_f32(x); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -231,6 +265,10 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_mul_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vmulq_f32(a,b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -245,6 +283,10 @@ static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_rcp_ps(a); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vrecpeq_f32(a); +#endif /* 
HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -265,6 +307,9 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) { #endif /* LV_HAVE_AVX512 */ } + + + static inline simd_f_t srslte_simd_f_sub(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX512 return _mm512_sub_ps(a, b); @@ -274,6 +319,10 @@ static inline simd_f_t srslte_simd_f_sub(simd_f_t a, simd_f_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_sub_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vsubq_f32(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -286,8 +335,12 @@ static inline simd_f_t srslte_simd_f_add(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX2 return _mm256_add_ps(a, b); #else /* LV_HAVE_AVX2 */ -#ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_add_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vaddq_f32(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -300,8 +353,12 @@ static inline simd_f_t srslte_simd_f_zero (void) { #ifdef LV_HAVE_AVX2 return _mm256_setzero_ps(); #else /* LV_HAVE_AVX2 */ -#ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_setzero_ps(); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vdupq_n_f32(0); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -344,6 +401,10 @@ static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_hadd_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -358,6 +419,10 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_sqrt_ps(a); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vrecpeq_f32(vrsqrteq_f32(a)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -368,10 +433,15 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { #if SRSLTE_SIMD_CF_SIZE +#ifdef HAVE_NEON + typedef float32x4x2_t simd_cf_t; +#else typedef struct { simd_f_t re; simd_f_t im; + } simd_cf_t; +#endif /* Complex Single precission Floating point functions */ static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) { @@ -399,6 +469,10 @@ static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) { __m128 i2 = _mm_load_ps((float*)(ptr + 2)); ret.re = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(2,0,2,0)); ret.im = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(3,1,3,1)); +#else +#ifdef HAVE_NEON + ret = vld2q_f32((float*)(ptr)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -431,6 +505,10 @@ static inline simd_cf_t srslte_simd_cfi_loadu(cf_t *ptr) { __m128 i2 = _mm_loadu_ps((float*)(ptr + 2)); ret.re = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(2,0,2,0)); ret.im = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(3,1,3,1)); +#else +#ifdef HAVE_NEON + ret = vld2q_f32((float*)(ptr)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -450,6 +528,11 @@ static inline simd_cf_t srslte_simd_cf_load(float *re, float *im) { #ifdef LV_HAVE_SSE ret.re = _mm_load_ps(re); ret.im = _mm_load_ps(im); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + ret.val[0] = 
vld1q_f32(ptr); + ret.val[1] = vld1q_f32(ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -469,6 +552,11 @@ static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) { #ifdef LV_HAVE_SSE ret.re = _mm_loadu_ps(re); ret.im = _mm_loadu_ps(im); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + ret.val[0] = vld1q_f32(ptr); + ret.val[1] = vld1q_f32(ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -497,6 +585,10 @@ static inline void srslte_simd_cfi_store(cf_t *ptr, simd_cf_t simdreg) { #ifdef LV_HAVE_SSE _mm_store_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im)); _mm_store_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im)); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst2q_f32((float*)(ptr), simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -524,6 +616,10 @@ static inline void srslte_simd_cfi_storeu(cf_t *ptr, simd_cf_t simdreg) { #ifdef LV_HAVE_SSE _mm_storeu_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im)); _mm_storeu_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im)); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst2q_f32((float*)(ptr), simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -541,6 +637,11 @@ static inline void srslte_simd_cf_store(float *re, float *im, simd_cf_t simdreg) #ifdef LV_HAVE_SSE _mm_store_ps((float *) re, simdreg.re); _mm_store_ps((float *) im, simdreg.im); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst1q_f32((float *) re, simdreg.val[0]); + vst1q_f32((float *) im, simdreg.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -558,6 +659,11 @@ static inline void srslte_simd_cf_storeu(float *re, float *im, simd_cf_t simdreg #ifdef LV_HAVE_SSE _mm_storeu_ps((float *) re, simdreg.re); _mm_storeu_ps((float *) im, simdreg.im); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst1q_f32((float *) re, simdreg.val[0]); + vst1q_f32((float *) im, simdreg.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -576,6 +682,11 @@ static inline simd_cf_t srslte_simd_cf_set1 (cf_t x) { #ifdef LV_HAVE_SSE ret.re = _mm_set1_ps(__real__ x); ret.im = _mm_set1_ps(__imag__ x); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + re.val[0] = vdupq_n_f32(__real__ x); + im.val[1] = vdupq_n_f32(__imag__ x); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -601,6 +712,13 @@ static inline simd_cf_t srslte_simd_cf_prod (simd_cf_t a, simd_cf_t b) { _mm_mul_ps(a.im, b.im)); ret.im = _mm_add_ps(_mm_mul_ps(a.re, b.im), _mm_mul_ps(a.im, b.re)); +#else +#ifdef HAVE_NEON + ret.val[0] = vsubq_f32(vmulq_f32(a.val[0],b.val[0]), + vmulq_f32(a.val[1],b.val[1])); + ret.val[1] = vaddq_f32(vmulq_f32(a.val[0],b.val[1]), + vmulq_f32(a.val[1],b.val[0])); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -626,6 +744,13 @@ static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) { _mm_mul_ps(a.im, b.im)); ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re), _mm_mul_ps(a.re, b.im)); + #else +#ifdef HAVE_NEON + ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]), + vmulq_f32(a.val[1],b.val[1])); + ret.val[1] = vsubq_f32(vmulq_f32(a.val[1],b.val[0]), + vmulq_f32(a.val[0],b.val[1])); +#endif /* HAVE_NEON */ #endif /* 
LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -645,6 +770,11 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) { #ifdef LV_HAVE_SSE ret.re = _mm_add_ps(a.re, b.re); ret.im = _mm_add_ps(a.im, b.im); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + ret.val[0] = vaddq_f32(a.val[0],a.val[0]); + ret.val[1] = vaddq_f32(a.val[1],a.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -665,6 +795,11 @@ static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) { #ifdef LV_HAVE_SSE ret.re = _mm_mul_ps(a.re, b); ret.im = _mm_mul_ps(a.im, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + ret.val[0] = vmulq_f32(a.val[0],b); + ret.val[1] = vmulq_f32(a.val[1],b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -699,6 +834,16 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im); ret.re = _mm_mul_ps(a.re, rcp); ret.im = _mm_mul_ps(neg_a_im, rcp); + #else /* LV_HAVE_SSE */ + #ifdef HAVE_NEON + simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]); + simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]); + simd_f_t mod2 = vaddq_f32(a2re, a2im); + simd_f_t rcp = vrecpeq_f32(mod2); + simd_f_t neg_a_im = vnegq_f32(vdupq_n_f32(-0.0f), a.val[1]); + ret.val[0] = vmulq_f32(a.val[0], rcp); + ret.val[1] = vmulq_f32(neg_a_im, rcp); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -718,7 +863,11 @@ static inline simd_cf_t srslte_simd_cf_zero (void) { #ifdef LV_HAVE_SSE ret.re = _mm_setzero_ps(); ret.im = _mm_setzero_ps(); -#endif /* LV_HAVE_SSE */ +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + ret.val[0] = vdupq_n_f32(0); + ret.val[1] = vdupq_n_f32(0); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ return ret; @@ -739,6 +888,11 @@ typedef __m256 simd_sel_t; #ifdef LV_HAVE_SSE typedef __m128i simd_i_t; typedef __m128 simd_sel_t; +#else /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_SSE +typedef int32x4_t simd_i_t; +typedef __m128 simd_sel_t; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -752,6 +906,10 @@ static inline simd_i_t srslte_simd_i_load(int *x) { #else #ifdef LV_HAVE_SSE return _mm_load_si128((__m128i*)x); +#else + #ifdef HAVE_NEON + return vld1_s32((int32x4_t*)x); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -780,6 +938,10 @@ static inline simd_i_t srslte_simd_i_set1(int x) { #else #ifdef LV_HAVE_SSE return _mm_set1_epi32(x); +#else + #ifdef HAVE_NEON + return vdupq_n_s32(x); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -794,10 +956,14 @@ static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) { #else #ifdef LV_HAVE_SSE return _mm_add_epi32(a, b); +#else +#ifdef HAVE_NEON + return vaddq_s32(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ -} +}vcgtq_f32 static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX512 @@ -808,6 +974,10 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return (simd_sel_t) _mm_cmpgt_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return (simd_sel_t) vcgtq_f32(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ 
#endif /* LV_HAVE_AVX512 */ @@ -841,6 +1011,10 @@ typedef __m256i simd_s_t; #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE typedef __m128i simd_s_t; +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON +typedef int16x8_t simd_s_t; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -854,6 +1028,10 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_load_si128((__m128i*) ptr); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vld1q_s16((int16x8_t*) ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -867,7 +1045,11 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) { return _mm256_loadu_si256((__m256i*) ptr); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE - return _mm_loadu_si128((__m128i*) ptr); + return _mm_loadu_si128((__m128i*) ptr) +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vld1q_s16((int16x8_t*) ptr); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -882,6 +1064,10 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE _mm_store_si128((__m128i*) ptr, simdreg); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + vst1q_s16((int16x8_t*) ptr, simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -896,11 +1082,15 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE _mm_storeu_si128((__m128i*) ptr, simdreg); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + vst1q_s16((int16x8_t*) ptr, simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ } - +vdupq_n_s16 static inline simd_s_t srslte_simd_s_zero(void) { #ifdef LV_HAVE_AVX512 return _mm512_setzero_si512(); @@ -910,10 +1100,14 @@ static inline simd_s_t srslte_simd_s_zero(void) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_setzero_si128(); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vdupq_n_s16(0); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ -} +}vmulq_s16 static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) { #ifdef LV_HAVE_AVX512 @@ -924,6 +1118,10 @@ static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_mullo_epi16(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vmulq_s16(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -938,6 +1136,10 @@ static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_add_epi16(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vaddq_s16(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -952,6 +1154,10 @@ static inline simd_s_t srslte_simd_s_sub(simd_s_t a, simd_s_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_sub_epi16(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vsubq_s16(a, b); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -962,8 +1168,9 @@ static inline simd_s_t srslte_simd_s_sub(simd_s_t a, simd_s_t b) { #if SRSLTE_SIMD_C16_SIZE -typedef struct { +typedef #ifdef LV_HAVE_AVX512 + struct { 
union { __m512i m512; int16_t i16[32]; @@ -974,24 +1181,32 @@ typedef struct { } im; #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 - union { - __m256i m256; - int16_t i16[16]; - } re; - union { - __m256i m256; - int16_t i16[16]; - } im; + struct { + union { + __m256i m256; + int16_t i16[16]; + } re; + union { + __m256i m256; + int16_t i16[16]; + } im; #else #ifdef LV_HAVE_SSE - union { - __m128i m128; - int16_t i16[8]; - } re; - union { - __m128i m128; - int16_t i16[8]; - } im; + struct { + union { + __m128i m128; + int16_t i16[8]; + } re; + union { + __m128i m128; + int16_t i16[8]; + } im; +#else +#ifdef HAVE_NEON + union { + int16x8x2_t m128; + int16_t i16[16]; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -1017,6 +1232,10 @@ static inline simd_c16_t srslte_simd_c16i_load(c16_t *ptr) { __m128i in2 = _mm_load_si128((__m128i*)(ptr + 8)); ret.re.m128 = _mm_blend_epi16(in1,_mm_shufflelo_epi16(_mm_shufflehi_epi16(in2, 0b10100000), 0b10100000), 0b10101010); ret.im.m128 = _mm_blend_epi16(_mm_shufflelo_epi16(_mm_shufflehi_epi16(in1, 0b11110101), 0b11110101),in2, 0b10101010); +#else /* LV_HAVE_SSE*/ +#ifdef HAVE_NEON + ret.m128 = vld2q_s16((int16_t*)(ptr)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -1032,6 +1251,11 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) { #ifdef LV_HAVE_SSE ret.re.m128 = _mm_load_si128((__m128i*)(re)); ret.im.m128 = _mm_load_si128((__m128i*)(im)); +#else /* LV_HAVE_SSE*/ +#ifdef HAVE_NEON + ret.m128.val[0] = vld1q_s16((int16_t*)(re)); + ret.m128.val[1] = vld1q_s16((int16_t*)(im)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ return ret; @@ -1046,6 +1270,11 @@ static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) { #ifdef LV_HAVE_SSE ret.re.m128 = _mm_loadu_si128((__m128i*)(re)); ret.im.m128 = _mm_loadu_si128((__m128i*)(im)); +#else /* LV_HAVE_SSE*/ +#ifdef HAVE_NEON + ret.m128.val[0] = vld1q_s16((int16_t*)(re)); + ret.m128.val[1] = vld1q_s16((int16_t*)(im)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ return ret; @@ -1063,6 +1292,10 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) { __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001); _mm_store_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010)); _mm_store_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010)); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst2q_f32((float*)(ptr) ,simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ } @@ -1079,6 +1312,10 @@ static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) { __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001); _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010)); _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010)); +#else /*HAVE_NEON*/ +#ifdef HAVE_NEON + vst2q_f32((float*)(ptr) ,simdreg); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ } @@ -1091,6 +1328,11 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si #ifdef LV_HAVE_SSE _mm_store_si128((__m128i *) re, simdreg.re.m128); _mm_store_si128((__m128i *) im, simdreg.im.m128); +#else +#ifdef HAVE_NEON + vst1q_f32((int16_t *) re, 
simdreg.m128.val[0]); + vst1q_f32((int16_t *) im, simdreg.m128.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ } @@ -1103,10 +1345,17 @@ static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t s #ifdef LV_HAVE_SSE _mm_storeu_si128((__m128i *) re, simdreg.re.m128); _mm_storeu_si128((__m128i *) im, simdreg.im.m128); +#else +#ifdef HAVE_NEON + vst1q_f32((int16_t *) re, simdreg.m128.val[0]); + vst1q_f32((int16_t *) im, simdreg.m128.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ } + +//TODO static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) { simd_c16_t ret; #ifdef LV_HAVE_AVX2 @@ -1134,11 +1383,16 @@ static inline simd_c16_t srslte_simd_c16_add (simd_c16_t a, simd_c16_t b) { #ifdef LV_HAVE_SSE ret.re.m128 = _mm_add_epi16(a.re.m128, b.re.m128); ret.im.m128 = _mm_add_epi16(a.im.m128, b.im.m128); +#else +#ifdef HAVE_NEON + ret.m128.val[0] = vaddq_s32(a.m127.val[0],a.m127.val[0]); + ret.m128.val[1] = vaddq_s32(a.m127.val[1],a.m127.val[1]); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ return ret; } - +vdupq_n_s16 static inline simd_c16_t srslte_simd_c16_zero (void) { simd_c16_t ret; #ifdef LV_HAVE_AVX2 @@ -1148,7 +1402,12 @@ static inline simd_c16_t srslte_simd_c16_zero (void) { #ifdef LV_HAVE_SSE ret.re.m128 = _mm_setzero_si128(); ret.im.m128 = _mm_setzero_si128(); -#endif /* LV_HAVE_SSE */ +#else +#ifdef HAVE_NEON + ret.m128.val[0] = vdupq_n_s16(0); + ret.m128.val[1] = vdupq_n_s16(0); +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ return ret; } @@ -1182,6 +1441,12 @@ static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) { __m128i ai = _mm_cvttps_epi32(a); __m128i bi = _mm_cvttps_epi32(b); return _mm_packs_epi32(ai, bi); + #else +#ifdef HAVE_NEON + int32x4_t ai = vcvtq_s32_f32(a); + int32x4_t bi = vcvtq_s32_f32(b); + return (simd_s_t)vcombine_s16(vqmovn_s32(ai), vqmovn_s32(bi)); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ From f4b9e7311a1daa4b306c9e8bebd3c956db3e6cad Mon Sep 17 00:00:00 2001 From: yagoda Date: Tue, 17 Oct 2017 15:51:27 +0000 Subject: [PATCH 2/4] adding neon support to new vector structure --- lib/include/srslte/phy/utils/simd.h | 123 +++++++++++++++++++--------- lib/src/phy/utils/vector_simd.c | 37 +++++++-- 2 files changed, 116 insertions(+), 44 deletions(-) diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index 6e4185788..e7820c307 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -33,6 +33,11 @@ #endif #include #endif /* LV_HAVE_SSE */ +#include + +#ifdef HAVE_NEON +#include +#endif /* * SSE Macros @@ -140,6 +145,7 @@ #define SRSLTE_SIMD_S_SIZE 0 #define SRSLTE_SIMD_C16_SIZE 0 +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -159,7 +165,7 @@ typedef __m256 simd_f_t; typedef __m128 simd_f_t; #else /* HAVE_NEON */ #ifdef HAVE_NEON -typedef float32x4 simd_f_t; +typedef float32x4_t simd_f_t; #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -188,7 +194,7 @@ static inline simd_f_t srslte_simd_f_loadu(float *ptr) { #ifdef LV_HAVE_AVX512 return _mm512_loadu_ps(ptr); #else /* LV_HAVE_AVX512 */ - #ifdef LV_HAVE_AVX2 + #ifdef LV_HAVE_AVX2 return _mm256_loadu_ps(ptr); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE @@ -202,7 +208,7 @@ static inline simd_f_t 
srslte_simd_f_loadu(float *ptr) { #endif /* LV_HAVE_AVX512 */ } -static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {vst1q_f32 +static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) { #ifdef LV_HAVE_AVX512 _mm512_store_ps(ptr, simdreg); #else /* LV_HAVE_AVX512 */ @@ -281,11 +287,11 @@ static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) { #ifdef LV_HAVE_AVX2 return _mm256_rcp_ps(a); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_rcp_ps(a); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return vrecpeq_f32(a); + return vmulq_f32(vrecpeq_f32(a), vrecpsq_f32(vrecpeq_f32(a), a)); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -302,6 +308,22 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_addsub_ps(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON + float* a_ptr = &a; + float* b_ptr = &b; + simd_f_t ret; + float* c_ptr = &ret; + for(int i = 0; i<4;i++){ + if(i%2==0){ + c_ptr[i] = a_ptr[i] - b_ptr[i]; + }else{ + c_ptr[i] = a_ptr[i] + b_ptr[i]; + } + } + + return ret; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -373,6 +395,10 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) { #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE return _mm_shuffle_ps(a, a, 0b10110001); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(a))); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -421,7 +447,9 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { return _mm_sqrt_ps(a); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return vrecpeq_f32(vrsqrteq_f32(a)); + float32x4_t sqrt_reciprocal = vrsqrteq_f32(a); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a,sqrt_reciprocal), sqrt_reciprocal),sqrt_reciprocal); + return vmulq_f32(a,sqrt_reciprocal); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -530,8 +558,8 @@ static inline simd_cf_t srslte_simd_cf_load(float *re, float *im) { ret.im = _mm_load_ps(im); #else /*HAVE_NEON*/ #ifdef HAVE_NEON - ret.val[0] = vld1q_f32(ptr); - ret.val[1] = vld1q_f32(ptr); + ret.val[0] = vld1q_f32(re); + ret.val[1] = vld1q_f32(im); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -554,8 +582,8 @@ static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) { ret.im = _mm_loadu_ps(im); #else /*HAVE_NEON*/ #ifdef HAVE_NEON - ret.val[0] = vld1q_f32(ptr); - ret.val[1] = vld1q_f32(ptr); + ret.val[0] = vld1q_f32(re); + ret.val[1] = vld1q_f32(im); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -684,8 +712,8 @@ static inline simd_cf_t srslte_simd_cf_set1 (cf_t x) { ret.im = _mm_set1_ps(__imag__ x); #else /*HAVE_NEON*/ #ifdef HAVE_NEON - re.val[0] = vdupq_n_f32(__real__ x); - im.val[1] = vdupq_n_f32(__imag__ x); + ret.val[0] = vdupq_n_f32(__real__ x); + ret.val[1] = vdupq_n_f32(__imag__ x); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -772,8 +800,8 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) { ret.im = _mm_add_ps(a.im, b.im); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - ret.val[0] = vaddq_f32(a.val[0],a.val[0]); - ret.val[1] = vaddq_f32(a.val[1],a.val[1]); + ret.val[0] = vaddq_f32(a.val[0],b.val[0]); + ret.val[1] = 
vaddq_f32(a.val[1],b.val[1]); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -839,8 +867,8 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]); simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]); simd_f_t mod2 = vaddq_f32(a2re, a2im); - simd_f_t rcp = vrecpeq_f32(mod2); - simd_f_t neg_a_im = vnegq_f32(vdupq_n_f32(-0.0f), a.val[1]); + simd_f_t rcp = vmulq_f32(vrecpeq_f32(mod2), vrecpsq_f32(vrecpeq_f32(mod2), mod2)); + simd_f_t neg_a_im = vnegq_f32(a.val[1]); ret.val[0] = vmulq_f32(a.val[0], rcp); ret.val[1] = vmulq_f32(neg_a_im, rcp); #endif /* HAVE_NEON */ @@ -868,6 +896,7 @@ static inline simd_cf_t srslte_simd_cf_zero (void) { ret.val[0] = vdupq_n_f32(0); ret.val[1] = vdupq_n_f32(0); #endif /* HAVE_NEON */ +#endif /* HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ return ret; @@ -889,9 +918,9 @@ typedef __m256 simd_sel_t; typedef __m128i simd_i_t; typedef __m128 simd_sel_t; #else /* LV_HAVE_AVX2 */ -#ifdef LV_HAVE_SSE +#ifdef HAVE_NEON typedef int32x4_t simd_i_t; -typedef __m128 simd_sel_t; +typedef int32x4_t simd_sel_t; #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -908,7 +937,7 @@ static inline simd_i_t srslte_simd_i_load(int *x) { return _mm_load_si128((__m128i*)x); #else #ifdef HAVE_NEON - return vld1_s32((int32x4_t*)x); + return vld1q_s32((int*)x); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -924,6 +953,10 @@ static inline void srslte_simd_i_store(int *x, simd_i_t reg) { #else #ifdef LV_HAVE_SSE _mm_store_si128((__m128i*)x, reg); +#else +#ifdef HAVE_NEON + vst1q_s32((int*)x, reg); +#endif /*HAVE_NEON*/ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -963,7 +996,7 @@ static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) { #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ -}vcgtq_f32 +} static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX512 @@ -992,6 +1025,25 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s #else #ifdef LV_HAVE_SSE return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON + + int* a_ptr = &a; + int* b_ptr = &b; + simd_i_t ret; + int* sel = &selector; + + int* c_ptr = &ret; + for(int i = 0;i<4;i++) + { + if(sel[i] == -1){ + c_ptr[i] = b_ptr[i]; + }else{ + c_ptr[i] = a_ptr[i]; + } + } + return ret; +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -1030,7 +1082,7 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) { return _mm_load_si128((__m128i*) ptr); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return vld1q_s16((int16x8_t*) ptr); + return vld1q_s16(ptr); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1048,7 +1100,7 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) { return _mm_loadu_si128((__m128i*) ptr) #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - return vld1q_s16((int16x8_t*) ptr); + return vld1q_s16(ptr); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1066,7 +1118,7 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) { _mm_store_si128((__m128i*) ptr, simdreg); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - vst1q_s16((int16x8_t*) ptr, simdreg); + vst1q_s16( ptr, simdreg); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ 
#endif /* LV_HAVE_AVX2 */ @@ -1084,13 +1136,12 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) { _mm_storeu_si128((__m128i*) ptr, simdreg); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON - vst1q_s16((int16x8_t*) ptr, simdreg); + vst1q_s16(ptr, simdreg); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ } -vdupq_n_s16 static inline simd_s_t srslte_simd_s_zero(void) { #ifdef LV_HAVE_AVX512 return _mm512_setzero_si512(); @@ -1107,7 +1158,7 @@ static inline simd_s_t srslte_simd_s_zero(void) { #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ -}vmulq_s16 +} static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) { #ifdef LV_HAVE_AVX512 @@ -1294,7 +1345,7 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) { _mm_store_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010)); #else /*HAVE_NEON*/ #ifdef HAVE_NEON - vst2q_f32((float*)(ptr) ,simdreg); + vst2q_s16((int16_t*)(ptr) ,simdreg.m128); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1314,7 +1365,7 @@ static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) { _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010)); #else /*HAVE_NEON*/ #ifdef HAVE_NEON - vst2q_f32((float*)(ptr) ,simdreg); + vst2q_s16((int16_t*)(ptr) ,simdreg.m128); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1330,8 +1381,8 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si _mm_store_si128((__m128i *) im, simdreg.im.m128); #else #ifdef HAVE_NEON - vst1q_f32((int16_t *) re, simdreg.m128.val[0]); - vst1q_f32((int16_t *) im, simdreg.m128.val[1]); + vst1q_s16((int16_t *) re, simdreg.m128.val[0]); + vst1q_s16((int16_t *) im, simdreg.m128.val[1]); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ @@ -1347,15 +1398,13 @@ static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t s _mm_storeu_si128((__m128i *) im, simdreg.im.m128); #else #ifdef HAVE_NEON - vst1q_f32((int16_t *) re, simdreg.m128.val[0]); - vst1q_f32((int16_t *) im, simdreg.m128.val[1]); + vst1q_s16((int16_t *) re, simdreg.m128.val[0]); + vst1q_s16((int16_t *) im, simdreg.m128.val[1]); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ } - -//TODO static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) { simd_c16_t ret; #ifdef LV_HAVE_AVX2 @@ -1385,14 +1434,14 @@ static inline simd_c16_t srslte_simd_c16_add (simd_c16_t a, simd_c16_t b) { ret.im.m128 = _mm_add_epi16(a.im.m128, b.im.m128); #else #ifdef HAVE_NEON - ret.m128.val[0] = vaddq_s32(a.m127.val[0],a.m127.val[0]); - ret.m128.val[1] = vaddq_s32(a.m127.val[1],a.m127.val[1]); + ret.m128.val[0] = vaddq_s16(a.m128.val[0],a.m128.val[0]); + ret.m128.val[1] = vaddq_s16(a.m128.val[1],a.m128.val[1]); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ return ret; } -vdupq_n_s16 + static inline simd_c16_t srslte_simd_c16_zero (void) { simd_c16_t ret; #ifdef LV_HAVE_AVX2 diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c index 0294bd1af..ab281a653 100644 --- a/lib/src/phy/utils/vector_simd.c +++ b/lib/src/phy/utils/vector_simd.c @@ -751,10 +751,37 @@ void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) { } } + + +int srslte_vec_sc_prod_ccc_simd2(cf_t *x, cf_t h, cf_t *z, int len) +{ + int i = 0; + const unsigned int loops 
= len / 4; +#ifdef HAVE_NEON + simd_cf_t h_vec; + h_vec.val[0] = srslte_simd_f_set1(__real__ h); + h_vec.val[1] = srslte_simd_f_set1(__imag__ h); + for (; i < loops; i++) { + + simd_cf_t in = srslte_simd_cfi_load(&x[i*4]); + simd_cf_t temp = srslte_simd_cf_prod(in, h_vec); + srslte_simd_cfi_store(&z[i*4], temp); + } + +#endif + i = loops * 4; +return i; +} + void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) { int i = 0; #if SRSLTE_SIMD_F_SIZE + + +#ifdef HAVE_NEON + i = srslte_vec_sc_prod_ccc_simd2(x, h, z, len); +#else const simd_f_t hre = srslte_simd_f_set1(__real__ h); const simd_f_t him = srslte_simd_f_set1(__imag__ h); @@ -766,8 +793,8 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) { simd_f_t sw = srslte_simd_f_swap(temp); simd_f_t m2 = srslte_simd_f_mul(him, sw); simd_f_t r = srslte_simd_f_addsub(m1, m2); - srslte_simd_f_store((float *) &z[i], r); + } } else { for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) { @@ -782,10 +809,11 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) { } } #endif - +#endif for (; i < len; i++) { z[i] = x[i] * h; } + } void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len) { @@ -831,7 +859,6 @@ void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len) { simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2); z1 = srslte_simd_f_sqrt(z1); - srslte_simd_f_store(&z[i], z1); } } else { @@ -966,9 +993,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) { if (SRSLTE_IS_ALIGNED(x)) { for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) { simd_f_t a = srslte_simd_f_load(&x[i]); - simd_sel_t res = srslte_simd_f_max(a, simd_max_values); - simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res); simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc); @@ -976,9 +1001,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) { } else { for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) { simd_f_t a = srslte_simd_f_loadu(&x[i]); - simd_sel_t res = srslte_simd_f_max(a, simd_max_values); - simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res); simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res); simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc); From 0504e7a51b42288cd9e7f9b113b7d8bc4cc2f30d Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Wed, 18 Oct 2017 12:49:43 +0200 Subject: [PATCH 3/4] Fixed test for abs value. 
Solved compilation Neon warnings and SSE errors --- lib/include/srslte/phy/utils/simd.h | 16 +++++++-------- lib/src/phy/utils/test/vector_test.c | 30 ++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index e7820c307..0c378591e 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -310,10 +310,10 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) { return _mm_addsub_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON - float* a_ptr = &a; - float* b_ptr = &b; + float* a_ptr = (float*) &a; + float* b_ptr = (float*) &b; simd_f_t ret; - float* c_ptr = &ret; + float* c_ptr = (float*) &ret; for(int i = 0; i<4;i++){ if(i%2==0){ c_ptr[i] = a_ptr[i] - b_ptr[i]; @@ -1028,12 +1028,12 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON - int* a_ptr = &a; - int* b_ptr = &b; + int* a_ptr = (int*) &a; + int* b_ptr = (int*) &b; simd_i_t ret; - int* sel = &selector; + int* sel = (int*) &selector; - int* c_ptr = &ret; + int* c_ptr = (int*) &ret; for(int i = 0;i<4;i++) { if(sel[i] == -1){ @@ -1097,7 +1097,7 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) { return _mm256_loadu_si256((__m256i*) ptr); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE - return _mm_loadu_si128((__m128i*) ptr) + return _mm_loadu_si128((__m128i*) ptr); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON return vld1q_s16(ptr); diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c index 8d5b9f2d6..4ebed9862 100644 --- a/lib/src/phy/utils/test/vector_test.c +++ b/lib/src/phy/utils/test/vector_test.c @@ -63,12 +63,12 @@ bool verbose = false; strncpy(func_name, #X, 32);\ CODE;\ passed = (mse < MAX_MSE);\ - printf("%32s (%5d) ... %7.1f MSamp/s ... %3s Passed\n", func_name, block_size, \ - (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not");\ + printf("%32s (%5d) ... %7.1f MSamp/s ... 
%3s Passed (%.6f)\n", func_name, block_size, \ + (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not", mse);\ return passed;\ } -#define MALLOC(TYPE, NAME) TYPE *NAME = malloc(sizeof(TYPE)*block_size) +#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size) static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) { @@ -507,7 +507,7 @@ TEST(srslte_vec_abs_cf, for (int i = 0; i < block_size; i++) { gold = sqrtf(crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i])); - mse += cabsf(gold - z[i]); + mse += cabsf(gold - z[i])/block_size; } free(x); @@ -771,12 +771,27 @@ int main(int argc, char **argv) { size_count++; } + char fname[68]; + FILE *f = NULL; + void * p = popen("(date +%g%m%d && hostname) | tr '\\r\\n' '__'", "r"); + if (p) { + fgets(fname, 64, p); + strncpy(fname + strnlen(fname, 64) - 1, ".tsv", 4); + f = fopen(fname, "w"); + if (f) printf("Saving benchmark results in '%s'\n", fname); + } + pclose(p); + + printf("\n"); printf("%32s |", "Subroutine/MSps"); + if (f) fprintf(f, "Subroutine/MSps Vs Vector size\t"); for (int i = 0; i < size_count; i++) { printf(" %7d", sizes[i]); + if (f) fprintf(f, "%d\t", sizes[i]); } printf(" |\n"); + if (f) fprintf(f, "\n"); for (int j = 0; j < 32; j++) { printf("-"); @@ -789,12 +804,19 @@ int main(int argc, char **argv) { for (int i = 0; i < func_count; i++) { printf("%32s | ", func_names[i]); + if (f) fprintf(f, "%s\t", func_names[i]); + for (int j = 0; j < size_count; j++) { printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]); + if (f) fprintf(f, "%.1f\t", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]); + all_passed &= passed[i][j]; } printf(" |\n"); + if (f) fprintf(f, "\n"); } + if (f) fclose(f); + return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR; } From 3292f9c2694ce5b5a75501427b5a2fd09fc38189 Mon Sep 17 00:00:00 2001 From: yagoda Date: Thu, 19 Oct 2017 16:38:58 +0000 Subject: [PATCH 4/4] simd.h tidy up & small fix for eMBMS --- lib/include/srslte/phy/utils/simd.h | 2 -- lib/src/phy/ue/ue_dl.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index 0c378591e..09e9cff8e 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -1027,12 +1027,10 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON - int* a_ptr = (int*) &a; int* b_ptr = (int*) &b; simd_i_t ret; int* sel = (int*) &selector; - int* c_ptr = (int*) &ret; for(int i = 0;i<4;i++) { diff --git a/lib/src/phy/ue/ue_dl.c b/lib/src/phy/ue/ue_dl.c index c4e2d3f6c..c11a29b8b 100644 --- a/lib/src/phy/ue/ue_dl.c +++ b/lib/src/phy/ue/ue_dl.c @@ -603,7 +603,8 @@ int srslte_ue_dl_decode_mbsfn(srslte_ue_dl_t * q, grant.sf_type = SRSLTE_SF_MBSFN; grant.nof_tb = 1; grant.mcs[0].idx = 2; - + grant.tb_en[0] = true; + grant.tb_en[1] = false; grant.nof_prb = q->pmch.cell.nof_prb; srslte_dl_fill_ra_mcs(&grant.mcs[0], grant.nof_prb); srslte_softbuffer_rx_reset_tbs(q->softbuffers[0], (uint32_t) grant.mcs[0].tbs);
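
---

Notes on the NEON layer added in this series:

Every kernel in vector_simd.c is built the same way on top of simd.h: a vector loop that steps by SRSLTE_SIMD_F_SIZE, followed by a scalar tail for the leftover elements. A minimal sketch of that pattern follows; vec_mul_fff is an invented name for illustration, while the srslte_simd_f_* calls and SRSLTE_SIMD_F_SIZE come straight from the simd.h introduced here.

#include "srslte/phy/utils/simd.h"

/* Element-wise float product, z[i] = x[i] * y[i] (hypothetical example
 * kernel; mirrors the structure of the real kernels in vector_simd.c). */
static void vec_mul_fff(float *x, float *y, float *z, int len) {
  int i = 0;
#if SRSLTE_SIMD_F_SIZE
  /* Vector body; unaligned accesses keep the sketch simple, whereas the
   * real kernels pick load/loadu based on SRSLTE_IS_ALIGNED. */
  for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
    simd_f_t a = srslte_simd_f_loadu(&x[i]);
    simd_f_t b = srslte_simd_f_loadu(&y[i]);
    srslte_simd_f_storeu(&z[i], srslte_simd_f_mul(a, b));
  }
#endif
  /* Scalar tail for the remaining len % SRSLTE_SIMD_F_SIZE samples. */
  for (; i < len; i++) {
    z[i] = x[i] * y[i];
  }
}

With HAVE_NEON defined, SRSLTE_SIMD_F_SIZE is 4 and the wrappers expand to vld1q_f32 / vmulq_f32 / vst1q_f32; on SSE the same code shape expands to the _mm_*_ps equivalents.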
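
Patch 2 replaces the bare vrecpeq_f32 / vrsqrteq_f32 estimates from patch 1 with one Newton-Raphson refinement step: the NEON estimate instructions deliver only roughly 8 bits of precision, well short of the SSE paths they stand in for. Isolated, the idiom looks like this (helper names are illustrative only):

#include <arm_neon.h>

/* 1/a with one refinement step: x1 = x0 * (2 - a*x0).
 * vrecpsq_f32(x0, a) computes (2 - x0*a). */
static inline float32x4_t rcp_refined(float32x4_t a) {
  float32x4_t e = vrecpeq_f32(a);          /* coarse estimate of 1/a */
  return vmulq_f32(e, vrecpsq_f32(e, a));  /* refined estimate */
}

/* sqrt(a) via the reciprocal square root: sqrt(a) = a * (1/sqrt(a)).
 * vrsqrtsq_f32(b, x0) computes (3 - b*x0)/2, the NR step for rsqrt.
 * Caveat, shared with the patch: a == 0 yields inf * 0 = NaN. */
static inline float32x4_t sqrt_refined(float32x4_t a) {
  float32x4_t r = vrsqrteq_f32(a);                     /* est. 1/sqrt(a) */
  r = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, r), r), r);  /* one NR step */
  return vmulq_f32(a, r);
}

Each extra vrecpsq/vrsqrtsq round roughly doubles the number of correct bits, so a kernel that needs closer-to-SSE accuracy can chain a second step at the cost of two more multiplies.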
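
The reason simd_cf_t becomes float32x4x2_t on NEON is that vld2q_f32 de-interleaves four (re, im) pairs into separate real and imaginary registers (val[0] / val[1]) in a single load, and vst2q_f32 re-interleaves on the store. That gives srslte_simd_cfi_load/store one-instruction implementations and lets srslte_simd_cf_prod work on split planes with no shuffles, where the SSE path needs _mm_shuffle_ps / _mm_unpack*_ps. A standalone illustration (neon_cf_scale is an invented name; this assumes cf_t is float complex, as defined elsewhere in srsLTE):

#include <arm_neon.h>
#include <complex.h>

typedef float complex cf_t;

/* Scale four interleaved complex samples by a real gain g. */
static inline void neon_cf_scale(const cf_t *x, float g, cf_t *z) {
  float32x4x2_t v = vld2q_f32((const float *)x);  /* val[0]=re, val[1]=im */
  v.val[0] = vmulq_n_f32(v.val[0], g);
  v.val[1] = vmulq_n_f32(v.val[1], g);
  vst2q_f32((float *)z, v);                       /* back to interleaved */
}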