From 9e5f999666a97b82fa0558b28294a7b9f62becbf Mon Sep 17 00:00:00 2001
From: Xavier Arteaga
Date: Thu, 28 Sep 2017 11:04:26 +0200
Subject: [PATCH] Added more SIMD functions: reciprocal, division, float/short
 conversion and maximum search

---
 lib/include/srslte/phy/utils/simd.h        | 237 ++++++++++
 lib/include/srslte/phy/utils/vector.h      |   5 +-
 lib/include/srslte/phy/utils/vector_simd.h |  63 +--
 lib/src/phy/mimo/precoding.c               |   3 +-
 lib/src/phy/sync/find_sss.c                |   8 +-
 lib/src/phy/utils/test/CMakeLists.txt      |   3 +-
 lib/src/phy/utils/test/vector_test.c       | 287 +++++++++++-
 lib/src/phy/utils/vector.c                 | 134 +-----
 lib/src/phy/utils/vector_simd.c            | 493 ++++++++++++++++-----
 9 files changed, 939 insertions(+), 294 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 22d8db79d..9a5f15dbb 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -88,6 +88,8 @@
 #define SRSLTE_SIMD_F_SIZE    16
 #define SRSLTE_SIMD_CF_SIZE   16
 
+#define SRSLTE_SIMD_I_SIZE    16
+
 #define SRSLTE_SIMD_S_SIZE    32
 #define SRSLTE_SIMD_C16_SIZE  0
 
@@ -97,6 +99,8 @@
 #define SRSLTE_SIMD_F_SIZE    8
 #define SRSLTE_SIMD_CF_SIZE   8
 
+#define SRSLTE_SIMD_I_SIZE    8
+
 #define SRSLTE_SIMD_S_SIZE    16
 #define SRSLTE_SIMD_C16_SIZE  16
 
@@ -106,6 +110,8 @@
 #define SRSLTE_SIMD_F_SIZE    4
 #define SRSLTE_SIMD_CF_SIZE   4
 
+#define SRSLTE_SIMD_I_SIZE    4
+
 #define SRSLTE_SIMD_S_SIZE    8
 #define SRSLTE_SIMD_C16_SIZE  8
 
@@ -114,6 +120,8 @@
 #define SRSLTE_SIMD_F_SIZE    0
 #define SRSLTE_SIMD_CF_SIZE   0
 
+#define SRSLTE_SIMD_I_SIZE    0
+
 #define SRSLTE_SIMD_S_SIZE    0
 #define SRSLTE_SIMD_C16_SIZE  0
 
@@ -223,6 +231,20 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
+static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
+#ifdef LV_HAVE_AVX512
+  /* AVX-512F has no _mm512_rcp_ps; the 14-bit approximation is the portable choice */
+  return _mm512_rcp14_ps(a);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_rcp_ps(a);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_rcp_ps(a);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
 static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
   __m512 r = _mm512_add_ps(a, b);
@@ -600,6 +622,61 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
   return ret;
 }
 
+static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  /* _mm512_permutexvar_ps takes the index vector first */
+  b = _mm512_permutexvar_ps(_mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15), b);
+  ret.re = _mm512_mul_ps(a.re, b);
+  ret.im = _mm512_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  b = _mm256_permutevar8x32_ps(b, _mm256_setr_epi32(0,4,1,5,2,6,3,7));
+  ret.re = _mm256_mul_ps(a.re, b);
+  ret.im = _mm256_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re = _mm_mul_ps(a.re, b);
+  ret.im = _mm_mul_ps(a.im, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
+
+static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm512_rcp14_ps(mod2);
+  simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
+  ret.re = _mm512_mul_ps(a.re, rcp);
+  ret.im = _mm512_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  simd_f_t a2re = _mm256_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm256_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm256_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm256_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm256_xor_ps(_mm256_set1_ps(-0.0f), a.im);
+  ret.re = _mm256_mul_ps(a.re, rcp);
+  ret.im = _mm256_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  simd_f_t a2re = _mm_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
+  ret.re = _mm_mul_ps(a.re, rcp);
+  ret.im = _mm_mul_ps(neg_a_im, rcp);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
+
 static inline simd_cf_t srslte_simd_cf_zero (void) {
   simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
@@ -621,6 +698,106 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
 
 #endif /* SRSLTE_SIMD_CF_SIZE */
 
+#if SRSLTE_SIMD_I_SIZE
+
+#ifdef LV_HAVE_AVX512
+typedef __m512i simd_i_t;
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+typedef __m256i simd_i_t;
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+typedef __m128i simd_i_t;
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+
+static inline simd_i_t srslte_simd_i_load(int *x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_load_epi32((__m512i*)x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_load_si256((__m256i*)x);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_load_si128((__m128i*)x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
+#ifdef LV_HAVE_AVX512
+  _mm512_store_epi32((__m512i*)x, reg);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  _mm256_store_si256((__m256i*)x, reg);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  _mm_store_si128((__m128i*)x, reg);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_set1(int x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_set1_epi32(x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_set1_epi32(x);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_set1_epi32(x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_add_epi32(a, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_add_epi32(a, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_add_epi32(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+/* Despite the name, this is a greater-than compare: it returns a lane mask with
+ * all bits set where a > b, for use with srslte_simd_i_select */
+static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX512
+  /* Expand the AVX-512 bit mask into a full-width integer lane mask */
+  return _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS), -1);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return (simd_i_t) _mm_cmpgt_ps(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) {
+#ifdef LV_HAVE_AVX512
+  /* AVX-512 has no blendv; 0xd8 implements the bitwise select (selector & b) | (~selector & a) */
+  return _mm512_ternarylogic_epi32(a, b, selector, 0xd8);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (__m256i) _mm256_blendv_ps((__m256) a, (__m256) b, (__m256) selector);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return (__m128i) _mm_blendv_ps((__m128) a, (__m128) b, (__m128) selector);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+#endif /* SRSLTE_SIMD_I_SIZE */
+
 
 #if SRSLTE_SIMD_S_SIZE
 
@@ -829,6 +1006,20 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
   return ret;
 }
 
+static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
+  simd_c16_t ret;
+#ifdef LV_HAVE_AVX2
+  ret.re.m256 = _mm256_loadu_si256((__m256i*)(re));
+  ret.im.m256 = _mm256_loadu_si256((__m256i*)(im));
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
+  ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+  return ret;
+}
+
 static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
@@ -845,6 +1036,22 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
+  __m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
+  _mm256_storeu_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
+  _mm256_storeu_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  __m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
+  __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
+  _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
+  _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   _mm256_store_si256((__m256i *) re, simdreg.re.m256);
@@ -857,6 +1064,18 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  _mm256_storeu_si256((__m256i *) re, simdreg.re.m256);
+  _mm256_storeu_si256((__m256i *) im, simdreg.im.m256);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  _mm_storeu_si128((__m128i *) re, simdreg.re.m128);
+  _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -905,6 +1124,24 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
 
 #endif /* SRSLTE_SIMD_C16_SIZE */
 
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
+static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX2
+  /* Cross the 128-bit lanes first so that packs_epi32 leaves the results in order */
+  __m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
+  __m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
+  __m256i ai = _mm256_cvttps_epi32(aa);
+  __m256i bi = _mm256_cvttps_epi32(bb);
+  return _mm256_packs_epi32(ai, bi);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  __m128i ai = _mm_cvttps_epi32(a);
+  __m128i bi = _mm_cvttps_epi32(b);
+  return _mm_packs_epi32(ai, bi);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
 
 #endif //SRSLTE_SIMD_H_H
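The reciprocal helpers above map to hardware approximation instructions, which deliver only about 12 bits of relative accuracy (about 14 bits for the AVX-512 rcp14 variant). Where that is not enough, one Newton-Raphson iteration roughly doubles the accurate bits. A minimal sketch on top of the new helper, assuming the pre-existing srslte_simd_f_set1, srslte_simd_f_sub and srslte_simd_f_mul from simd.h; simd_f_rcp_refined is a hypothetical name, not part of this patch:

    /* x1 = x0 * (2 - d * x0): one Newton-Raphson step on the rcp estimate */
    static inline simd_f_t simd_f_rcp_refined(simd_f_t d) {
      simd_f_t x0 = srslte_simd_f_rcp(d);               /* ~12-bit estimate */
      simd_f_t t  = srslte_simd_f_mul(d, x0);           /* d * x0, close to 1 */
      simd_f_t c  = srslte_simd_f_sub(srslte_simd_f_set1(2.0f), t);
      return srslte_simd_f_mul(x0, c);                  /* roughly float precision */
    }

The same refinement applies per component to srslte_simd_cf_rcp if a more accurate complex reciprocal is ever needed.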
diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h
index 0fadfb334..9b99c6fff 100644
--- a/lib/include/srslte/phy/utils/vector.h
+++ b/lib/include/srslte/phy/utils/vector.h
@@ -123,6 +123,7 @@ SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
@@ -142,8 +143,8 @@ SRSLTE_API float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len);
 SRSLTE_API int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len);
 
 /* z=x/y vector division (element-wise) */
-SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len);
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len);
+SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len);
 
 /* conjugate */
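srslte_vec_div_ccc and srslte_vec_div_cfc no longer take caller-provided scratch buffers (y_mod, z_real, z_imag); the SIMD implementations keep the intermediate products in registers. A call site now reduces to, for example (buffer names are illustrative):

    /* z[i] = x[i] / h[i] for i = 0..len-1 */
    srslte_vec_div_ccc(x, h, z, len);

which is exactly what the updated find_sss.c below does for the SSS channel-equalization step.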
diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 4ee839fab..294cff50f 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -36,26 +36,29 @@ extern "C" {
 #include "srslte/config.h"
 
 #ifdef LV_HAVE_AVX512
+#define SRSLTE_SIMD_BIT_ALIGN 512
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0)
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX
+#define SRSLTE_SIMD_BIT_ALIGN 256
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0)
 #else /* LV_HAVE_AVX */
 #ifdef LV_HAVE_SSE
+#define SRSLTE_SIMD_BIT_ALIGN 128
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
 #else /* LV_HAVE_SSE */
+#define SRSLTE_SIMD_BIT_ALIGN 64
 #define SRSLTE_IS_ALIGNED(PTR) (1)
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX */
 #endif /* LV_HAVE_AVX512 */
 
-SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
-
+/* SIMD Basic vector math */
 SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
 SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
-SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API float srslte_vec_acc_ff_simd(float *x, int len);
 
 SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);
 
@@ -63,59 +66,63 @@ SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);
 
 SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Vector Scalar Product */
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x, const float h, cf_t *y, const int len);
+
 SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);
 
 SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);
 
+/* SIMD Vector Product */
+SRSLTE_API void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                             int16_t *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+
+SRSLTE_API void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
+
 SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);
 
 SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x, cf_t *y, cf_t *z, int len);
 
 SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x, cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+/* SIMD Division */
+SRSLTE_API void srslte_vec_div_ccc_simd(cf_t *x, cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
-                                             int16_t *r_im, int len);
+SRSLTE_API void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+SRSLTE_API void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Dot product */
 SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);
 
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);
 
-SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
-
 SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);
 
-SRSLTE_API void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
+SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
 
+/* SIMD Modulus functions */
 SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);
 
 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
+/* Other Functions */
+SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-
-SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len);
 
 SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
 
+/* SIMD Find Max functions */
+SRSLTE_API uint32_t srslte_vec_max_fi_simd(float *x, int len);
+
+SRSLTE_API uint32_t srslte_vec_max_ci_simd(cf_t *x, int len);
+
 #ifdef __cplusplus
 }
 #endif
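SRSLTE_SIMD_BIT_ALIGN exposes the alignment (in bits) that the widest enabled instruction set wants, and SRSLTE_IS_ALIGNED is what the kernels use to choose the aligned or unaligned load/store path at run time. Callers that want the fast path can allocate accordingly; a minimal sketch, assuming standard posix_memalign and illustrative variable names:

    #include <stdlib.h>

    cf_t *buf = NULL;
    if (posix_memalign((void **) &buf, SRSLTE_SIMD_BIT_ALIGN / 8, nof_samples * sizeof(cf_t))) {
      /* allocation failed */
    }
    /* buf now satisfies SRSLTE_IS_ALIGNED(buf), so the *_simd kernels take the aligned branch */

Misaligned buffers still work correctly; the kernels simply fall back to unaligned accesses.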
diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c
index f1aab3b5d..8f8bd7737 100644
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@@ -36,17 +36,16 @@
 
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
 #endif
 
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 #endif
+#include "srslte/phy/utils/mat.h"
 
 static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;
 
diff --git a/lib/src/phy/sync/find_sss.c b/lib/src/phy/sync/find_sss.c
index 2afeced42..082aee52a 100644
--- a/lib/src/phy/sync/find_sss.c
+++ b/lib/src/phy/sync/find_sss.c
@@ -70,14 +70,12 @@ static void corr_all_sz_partial(cf_t z[SRSLTE_SSS_N], float s[SRSLTE_SSS_N][SRSLTE_SSS_N])
 
 static void extract_pair_sss(srslte_sss_synch_t *q, cf_t *input, cf_t *ce, cf_t y[2][SRSLTE_SSS_N]) {
   cf_t input_fft[SRSLTE_SYMBOL_SZ_MAX];
-  float ce_mod[2*SRSLTE_SSS_N], z_real[2*SRSLTE_SSS_N], z_imag[2*SRSLTE_SSS_N];
-
+
   srslte_dft_run_c(&q->dftp_input, input, input_fft);
 
   if (ce) {
-    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce, ce_mod,
-                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], z_real, z_imag,
-                       2*SRSLTE_SSS_N);
+    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce,
+                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], 2*SRSLTE_SSS_N);
   }
 
   for (int i = 0; i < SRSLTE_SSS_N; i++) {
 
diff --git a/lib/src/phy/utils/test/CMakeLists.txt b/lib/src/phy/utils/test/CMakeLists.txt
index 76df7ac59..1f5c66827 100644
--- a/lib/src/phy/utils/test/CMakeLists.txt
+++ b/lib/src/phy/utils/test/CMakeLists.txt
@@ -44,4 +44,5 @@ add_test(algebra_2x2_zf_solver_test algebra_test -z)
 add_test(algebra_2x2_mmse_solver_test algebra_test -m)
 
 add_executable(vector_test vector_test.c)
-target_link_libraries(vector_test srslte_phy)
\ No newline at end of file
+target_link_libraries(vector_test srslte_phy)
+add_test(vector_test vector_test)
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
index 05dce1d35..cf0f38926 100644
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -89,6 +89,26 @@ float squared_error (cf_t a, cf_t b) {
   return diff_re*diff_re + diff_im*diff_im;
 }
 
+TEST(srslte_vec_acc_ff,
+  MALLOC(float, x);
+  float z;
+
+  float gold = 0.0f;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+  }
+
+  TEST_CALL(z = srslte_vec_acc_ff(x, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold += x[i];
+  }
+
+  mse += fabs(gold - z) / gold;
+
+  free(x);
+)
+
 TEST(srslte_vec_dot_prod_sss,
   MALLOC(int16_t, x);
   MALLOC(int16_t, y);
@@ -314,6 +334,37 @@ TEST(srslte_vec_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_prod_ccc_split,
+  MALLOC(float, x_re);
+  MALLOC(float, x_im);
+  MALLOC(float, y_re);
+  MALLOC(float, y_im);
+  MALLOC(float, z_re);
+  MALLOC(float, z_im);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x_re[i] = RANDOM_F();
+    x_im[i] = RANDOM_F();
+    y_re[i] = RANDOM_F();
+    y_im[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_ccc_split(x_re, x_im, y_re, y_im, z_re, z_im, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = (x_re[i] + I * x_im[i]) * (y_re[i] + I * y_im[i]);
+    mse += cabsf(gold - (z_re[i] + I * z_im[i]));
+  }
+
+  free(x_re);
+  free(x_im);
+  free(y_re);
+  free(y_im);
+  free(z_re);
+  free(z_im);
+)
+
 TEST(srslte_vec_prod_conj_ccc,
   MALLOC(cf_t, x);
   MALLOC(cf_t, y);
@@ -357,6 +408,27 @@ TEST(srslte_vec_sc_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_convert_fi,
+  MALLOC(float, x);
+  MALLOC(short, z);
+  float scale = 1000.0f;
+
+  short gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = (float) RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_convert_fi(x, z, scale, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = (short) (x[i] * scale);
+    mse += fabsf((float) gold - (float) z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
 TEST(srslte_vec_prod_fff,
   MALLOC(float, x);
   MALLOC(float, y);
@@ -376,6 +448,30 @@ TEST(srslte_vec_prod_fff,
   }
 
   free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_prod_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(y);
   free(z);
 )
@@ -461,66 +557,216 @@ TEST(srslte_vec_sc_prod_cfc,
   free(z);
 )
 
+TEST(srslte_vec_div_ccc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_div_ccc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_div_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_div_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += cabsf(gold - z[i]) / cabsf(gold);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_div_fff,
+  MALLOC(float, x);
+  MALLOC(float, y);
+  MALLOC(float, z);
+
+  float gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_div_fff(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += fabsf(gold - z[i]);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_max_fi,
+  MALLOC(float, x);
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+  }
+
+  uint32_t max_index = 0;
+  TEST_CALL(max_index = srslte_vec_max_fi(x, block_size);)
+
+  float gold_value = -INFINITY;
+  uint32_t gold_index = 0;
+  for (int i = 0; i < block_size; i++) {
+    if (gold_value < x[i]) {
+      gold_value = x[i];
+      gold_index = i;
+    }
+  }
+  mse = (gold_index != max_index) ? 1 : 0;
+
+  free(x);
+)
+
+TEST(srslte_vec_max_abs_ci,
+  MALLOC(cf_t, x);
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  uint32_t max_index = 0;
+  TEST_CALL(max_index = srslte_vec_max_abs_ci(x, block_size);)
+
+  float gold_value = -INFINITY;
+  uint32_t gold_index = 0;
+  for (int i = 0; i < block_size; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > gold_value) {
+      gold_value = abs2;
+      gold_index = (uint32_t) i;
+    }
+  }
+  mse = (gold_index != max_index) ? 1 : 0;
+
+  free(x);
+)
+
 int main(int argc, char **argv) {
   char func_names[MAX_FUNCTIONS][32];
   double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
   uint32_t sizes[32];
   uint32_t size_count = 0;
   uint32_t func_count = 0;
-  bool passed = true;
+  bool passed[MAX_FUNCTIONS][MAX_BLOCKS];
+  bool all_passed = true;
 
   for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
     func_count = 0;
 
-    passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_ff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;
 
-    passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_convert_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc_split(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
     sizes[size_count] = block_size;
@@ -546,10 +792,11 @@ int main(int argc, char **argv) {
   for (int i = 0; i < func_count; i++) {
     printf("%32s | ", func_names[i]);
     for (int j = 0; j < size_count; j++) {
-      printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      all_passed &= passed[i][j];
     }
     printf(" |\n");
   }
 
-  return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
+  return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
 }
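Each cell of the results table is a throughput figure, larger is better, computed per function i and block size j as

    (double) NOF_REPETITIONS * (double) sizes[j] / timmings[i][j]

i.e. samples processed per unit of measured time; failing runs are additionally flagged in red via the ANSI escape \x1B[31m. The absolute time unit depends on how the TEST_CALL macro (defined earlier in vector_test.c and not shown in this patch) fills timmings.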
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index cb21f24f1..f85dbca0a 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -48,18 +48,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
 
 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_ACC_FUNCTION
-  float result;
-  volk_32f_accumulator_s32f(&result,x,len);
-  return result;
-#else
-  int i;
-  float z=0;
-  for (i=0;i<len;i++) {
-    z+=x[i];
-  }
-  return z;
-#endif
+  return srslte_vec_acc_ff_simd(x, len);
 }

[...]

 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
[...]
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  for (i=0;i<len;i++) {
-    if (x[i]>m) {
-      m=x[i];
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_fi_simd(x, len);
 }
 
 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
 
@@ -616,30 +541,7 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
 
 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_32
-  uint32_t target=0;
-  volk_32fc_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_16
-  uint32_t target=0;
-  volk_32fc_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  float tmp;
-  for (i=0;i<len;i++) {
-    tmp = __real__ x[i] * __real__ x[i] + __imag__ x[i] * __imag__ x[i];
-    if (tmp>m) {
-      m=tmp;
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_ci_simd(x, len);
 }
 
 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {
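The float-to-short conversion declared in vector_simd.h builds on srslte_simd_convert_2f_s from simd.h: two float registers are scaled, truncated and packed into one saturated int16 register per iteration. A minimal sketch of such a loop, as an illustration rather than the exact body of srslte_vec_convert_fi_simd, assuming the aligned load/store helpers srslte_simd_f_load and srslte_simd_s_store from simd.h:

    void convert_fi_sketch(float *x, int16_t *z, float scale, int len) {
      int i = 0;
    #if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
      simd_f_t s = srslte_simd_f_set1(scale);
      for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
        simd_f_t a = srslte_simd_f_mul(srslte_simd_f_load(&x[i]), s);
        simd_f_t b = srslte_simd_f_mul(srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]), s);
        srslte_simd_s_store(&z[i], srslte_simd_convert_2f_s(a, b));  /* truncates and saturates */
      }
    #endif
      for (; i < len; i++) {
        z[i] = (int16_t) (x[i] * scale);  /* scalar tail, matches the gold reference in vector_test.c */
      }
    }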
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 2eb0428b7..2dd354548 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
 
 /* No improvement with AVX */
 
-void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
-{
-#ifdef DEBUG_MODE
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }

[...]

+uint32_t srslte_vec_max_fi_simd(float *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    if (x[i] > max_value) {
+      max_value = x[i];
+      max_index = (uint32_t) i;
+    }
+  }
+
+  return max_index;
+}
+
+uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > max_value) {
+      max_value = abs2;
+      max_index = (uint32_t) i;
+    }
+  }
+
+  return max_index;
+}
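The two kernels above share one idea: srslte_simd_f_max produces a per-lane greater-than mask and srslte_simd_i_select uses it to update the running maxima and their indexes branchlessly, with a scalar reduction over the lane buffers at the end. From the caller's side, the legacy wrappers keep their uint32_t-based signatures; a hedged usage sketch with illustrative buffer names:

    /* index of the strongest sample and of the strongest complex tap */
    uint32_t peak     = srslte_vec_max_fi(power, nof_samples);   /* argmax of power[i] */
    uint32_t peak_tap = srslte_vec_max_abs_ci(h, nof_taps);      /* argmax of |h[i]|^2 */
    float    peak_val = power[peak];

Note that srslte_vec_max_abs_ci compares squared magnitudes, which preserves the argmax while avoiding a square root per element.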