Added more functions

This commit is contained in:
Xavier Arteaga 2017-09-28 11:04:26 +02:00
parent c41ad5453c
commit 9e5f999666
9 changed files with 939 additions and 294 deletions

View File

@ -88,6 +88,8 @@
#define SRSLTE_SIMD_F_SIZE 16
#define SRSLTE_SIMD_CF_SIZE 16
#define SRSLTE_SIMD_I_SIZE 16
#define SRSLTE_SIMD_S_SIZE 32
#define SRSLTE_SIMD_C16_SIZE 0
@ -97,6 +99,8 @@
#define SRSLTE_SIMD_F_SIZE 8
#define SRSLTE_SIMD_CF_SIZE 8
#define SRSLTE_SIMD_I_SIZE 8
#define SRSLTE_SIMD_S_SIZE 16
#define SRSLTE_SIMD_C16_SIZE 16
@ -106,6 +110,8 @@
#define SRSLTE_SIMD_F_SIZE 4
#define SRSLTE_SIMD_CF_SIZE 4
#define SRSLTE_SIMD_I_SIZE 4
#define SRSLTE_SIMD_S_SIZE 8
#define SRSLTE_SIMD_C16_SIZE 8
@ -114,6 +120,8 @@
#define SRSLTE_SIMD_F_SIZE 0
#define SRSLTE_SIMD_CF_SIZE 0
#define SRSLTE_SIMD_I_SIZE 0
#define SRSLTE_SIMD_S_SIZE 0
#define SRSLTE_SIMD_C16_SIZE 0
@ -223,6 +231,20 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
#endif /* LV_HAVE_AVX512 */
}
static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
#ifdef LV_HAVE_AVX512
return _mm512_rcp14_ps(a); /* AVX512F has no _mm512_rcp_ps; rcp14 is the 14-bit estimate */
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_rcp_ps(a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_rcp_ps(a);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
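The rcp intrinsics used above return a fast approximation of the reciprocal (roughly 12 mantissa bits for _mm_rcp_ps/_mm256_rcp_ps, 14 bits for _mm512_rcp14_ps) rather than an exact 1/x. A hypothetical refinement helper, sketched here for illustration only, would recover near-full precision with one Newton-Raphson step:

/* Sketch, not part of this commit; assumes srslte_simd_f_set1/_sub/_mul exist.
 * One Newton-Raphson iteration r' = r * (2 - a*r) roughly doubles the number
 * of accurate bits of the initial estimate r. */
static inline simd_f_t srslte_simd_f_rcp_refined(simd_f_t a) {
  simd_f_t r = srslte_simd_f_rcp(a);  /* initial hardware estimate */
  return srslte_simd_f_mul(r, srslte_simd_f_sub(srslte_simd_f_set1(2.0f),
                                                srslte_simd_f_mul(a, r)));
}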
static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
#ifdef LV_HAVE_AVX512
__m512 r = _mm512_add_ps(a, b);
@ -600,6 +622,61 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
return ret;
}
static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512
b = _mm512_permutexvar_ps(_mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15), b); /* idx is the first argument of _mm512_permutexvar_ps */
ret.re = _mm512_mul_ps(a.re, b);
ret.im = _mm512_mul_ps(a.im, b);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
b = _mm256_permutevar8x32_ps(b, _mm256_setr_epi32(0,4,1,5,2,6,3,7));
ret.re = _mm256_mul_ps(a.re, b);
ret.im = _mm256_mul_ps(a.im, b);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
ret.re = _mm_mul_ps(a.re, b);
ret.im = _mm_mul_ps(a.im, b);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
return ret;
}
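Under AVX2 and AVX512 the complex load/store helpers keep the real and imaginary lanes in an internally permuted order, so the plain float vector b is permuted with the same index pattern before the lane-wise multiply; the SSE registers stay in source order, hence no permute there. Semantically this is an element-wise complex-by-real product, sketched in scalar form:

/* Scalar meaning of srslte_simd_cf_mul (sketch): the permute only realigns
 * y with the internal lane order of a.re/a.im, it does not change which y[i]
 * scales which x[i].
 *   z[i].re = x[i].re * y[i];
 *   z[i].im = x[i].im * y[i];
 */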
static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512
simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
simd_f_t rcp = _mm512_rcp14_ps(mod2); /* AVX512F reciprocal estimate */
simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
ret.re = _mm512_mul_ps(a.re, rcp);
ret.im = _mm512_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
simd_f_t a2re = _mm256_mul_ps(a.re, a.re);
simd_f_t a2im = _mm256_mul_ps(a.im, a.im);
simd_f_t mod2 = _mm256_add_ps(a2re, a2im);
simd_f_t rcp = _mm256_rcp_ps(mod2);
simd_f_t neg_a_im = _mm256_xor_ps(_mm256_set1_ps(-0.0f), a.im);
ret.re = _mm256_mul_ps(a.re, rcp);
ret.im = _mm256_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
simd_f_t a2re = _mm_mul_ps(a.re, a.re);
simd_f_t a2im = _mm_mul_ps(a.im, a.im);
simd_f_t mod2 = _mm_add_ps(a2re, a2im);
simd_f_t rcp = _mm_rcp_ps(mod2);
simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
ret.re = _mm_mul_ps(a.re, rcp);
ret.im = _mm_mul_ps(neg_a_im, rcp);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
return ret;
}
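The complex reciprocal uses the identity 1/z = conj(z)/|z|^2: the XOR with -0.0f flips the sign bit of the imaginary part (the conjugate), and the hardware reciprocal approximates 1/|z|^2. A scalar reference, for illustration only (cf_rcp_scalar is hypothetical, not part of this commit):

static inline cf_t cf_rcp_scalar(cf_t z) {
  float mod2 = __real__ z * __real__ z + __imag__ z * __imag__ z;
  cf_t r;
  __real__ r =  __real__ z / mod2;  /* re(1/z) =  re(z) / |z|^2 */
  __imag__ r = -__imag__ z / mod2;  /* im(1/z) = -im(z) / |z|^2 */
  return r;
}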
static inline simd_cf_t srslte_simd_cf_zero (void) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512
@ -621,6 +698,106 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
#endif /* SRSLTE_SIMD_CF_SIZE */
#if SRSLTE_SIMD_I_SIZE
#ifdef LV_HAVE_AVX512
typedef __m512i simd_i_t;
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
typedef __m256i simd_i_t;
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
typedef __m128i simd_i_t;
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
static inline simd_i_t srslte_simd_i_load(int *x) {
#ifdef LV_HAVE_AVX512
return _mm512_load_epi32((__m512i*)x);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_load_si256((__m256i*)x);
#else
#ifdef LV_HAVE_SSE
return _mm_load_si128((__m128i*)x);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
#ifdef LV_HAVE_AVX512
_mm512_store_epi32((__m512i*)x, reg);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
_mm256_store_si256((__m256i*)x, reg);
#else
#ifdef LV_HAVE_SSE
_mm_store_si128((__m128i*)x, reg);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_i_t srslte_simd_i_set1(int x) {
#ifdef LV_HAVE_AVX512
return _mm512_set1_epi32(x);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_set1_epi32(x);
#else
#ifdef LV_HAVE_SSE
return _mm_set1_epi32(x);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
#ifdef LV_HAVE_AVX512
return _mm512_add_epi32(a, b);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_add_epi32(a, b);
#else
#ifdef LV_HAVE_SSE
return _mm_add_epi32(a, b);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
  /* Returns a per-lane all-ones mask where a > b (not the maximum value);
   * intended to be combined with srslte_simd_i_select. */
#ifdef LV_HAVE_AVX512
  /* _mm512_cmp_ps_mask yields a __mmask16 bit-mask, which cannot be cast to a
   * vector type; expand it to a full-width lane mask instead. */
  return _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS), -1);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return (simd_i_t) _mm_cmpgt_ps(a, b);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) {
#ifdef LV_HAVE_AVX512
  /* There is no 512-bit blendv intrinsic; select b where the selector lanes are set. */
  return _mm512_or_si512(_mm512_andnot_si512(selector, a), _mm512_and_si512(selector, b));
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
  return (__m256i) _mm256_blendv_ps((__m256) a, (__m256) b, (__m256) selector);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return (__m128i) _mm_blendv_ps((__m128) a, (__m128) b, (__m128) selector);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
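Despite its name, srslte_simd_f_max does not return a maximum: it returns a per-lane greater-than mask meant to drive srslte_simd_i_select. The intended pattern, as used by srslte_vec_max_fi_simd later in this commit:

/* Argmax tracking: where a > current maximum, take the new index and value. */
simd_i_t res     = srslte_simd_f_max(a, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values  = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);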
#endif /* SRSLTE_SIMD_I_SIZE */
#if SRSLTE_SIMD_S_SIZE
@ -829,6 +1006,20 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
return ret;
}
static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
simd_c16_t ret;
#ifdef LV_HAVE_AVX2
ret.re.m256 = _mm256_loadu_si256((__m256i*)(re));
ret.im.m256 = _mm256_loadu_si256((__m256i*)(im));
#else
#ifdef LV_HAVE_SSE
ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
return ret;
}
static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
#ifdef LV_HAVE_AVX2
__m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
@ -845,6 +1036,22 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
#endif /* LV_HAVE_AVX2 */
}
static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
#ifdef LV_HAVE_AVX2
__m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
__m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
_mm256_storeu_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
_mm256_storeu_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
#else
#ifdef LV_HAVE_SSE
__m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
__m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
_mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
_mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
}
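The shufflehi/shufflelo pair with immediate 0b10110001 swaps adjacent 16-bit elements within each four-element group, and the blends with mask 0b10101010 then take alternating 16-bit slots from the re and im registers, re-interleaving the split registers into (re,im) pairs and undoing the de-interleave performed by the corresponding c16 loads. A one-quad sketch of the pair swap:

/* im      : i0 i1 i2 i3
 * im_sw   : i1 i0 i3 i2          (shuffle immediate 0b10110001)
 * blend(re, im_sw, 0b10101010)   -> re in even slots, im in odd slots */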
static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
#ifdef LV_HAVE_AVX2
_mm256_store_si256((__m256i *) re, simdreg.re.m256);
@ -857,6 +1064,18 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si
#endif /* LV_HAVE_AVX2 */
}
static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t simdreg) {
#ifdef LV_HAVE_AVX2
_mm256_storeu_si256((__m256i *) re, simdreg.re.m256);
_mm256_storeu_si256((__m256i *) im, simdreg.im.m256);
#else
#ifdef LV_HAVE_SSE
_mm_storeu_si128((__m128i *) re, simdreg.re.m128);
_mm_storeu_si128((__m128i *) im, simdreg.im.m128);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
}
static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
simd_c16_t ret;
#ifdef LV_HAVE_AVX2
@ -905,6 +1124,24 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
#endif /* SRSLTE_SIMD_C16_SIZE */
#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
#ifdef LV_HAVE_AVX2
__m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
__m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
__m256i ai = _mm256_cvttps_epi32(aa);
__m256i bi = _mm256_cvttps_epi32(bb);
return _mm256_packs_epi32(ai, bi);
#else
#ifdef LV_HAVE_SSE
__m128i ai = _mm_cvttps_epi32(a);
__m128i bi = _mm_cvttps_epi32(b);
return _mm_packs_epi32(ai, bi);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
}
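Note that _mm256_cvttps_epi32 truncates toward zero and _mm256_packs_epi32 saturates to the int16 range, but the pack operates independently within each 128-bit lane; the _mm256_permute2f128_ps pre-swizzle regroups a and b so the packed shorts come out in source order. Per element the conversion behaves like this scalar sketch (illustration only):

/* Scalar sketch of the per-element conversion (hypothetical helper). */
static inline int16_t convert_f_s_scalar(float v) {
  int32_t w = (int32_t) v;                 /* cvttps: truncate toward zero */
  if (w >  32767) w =  32767;              /* packs: saturate to int16 */
  if (w < -32768) w = -32768;
  return (int16_t) w;
}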
#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
#endif //SRSLTE_SIMD_H_H

View File

@ -123,6 +123,7 @@ SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint
/* vector product (element-wise) */
SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
SRSLTE_API void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len);
/* vector product (element-wise) */
SRSLTE_API void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
@ -142,8 +143,8 @@ SRSLTE_API float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len);
SRSLTE_API int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len);
/* z=x/y vector division (element-wise) */
SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
SRSLTE_API void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
SRSLTE_API void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len);
/* conjugate */

View File

@ -36,26 +36,29 @@ extern "C" {
#include "srslte/config.h"
#ifdef LV_HAVE_AVX512
#define SRSLTE_SIMD_BIT_ALIGN 512
#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0)
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX
#define SRSLTE_SIMD_BIT_ALIGN 256
#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0)
#else /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#define SRSLTE_SIMD_BIT_ALIGN 128
#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
#else /* LV_HAVE_SSE */
#define SRSLTE_SIMD_BIT_ALIGN 64
#define SRSLTE_IS_ALIGNED(PTR) (1)
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
#endif /* LV_HAVE_AVX512 */
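SRSLTE_IS_ALIGNED simply masks the low address bits: 0x3F tests 64-byte (512-bit) alignment, 0x1F 32-byte, 0x0F 16-byte. For example, in an AVX build:

/* SRSLTE_IS_ALIGNED((void*) 0x1000) -> 1  (0x1000 & 0x1F == 0,    aligned load/store OK)
 * SRSLTE_IS_ALIGNED((void*) 0x1010) -> 0  (0x1010 & 0x1F == 0x10, fall back to loadu/storeu) */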
/* SIMD Basic vector math */
SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
SRSLTE_API float srslte_vec_acc_ff_simd(float *x, int len);
SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);
@ -63,59 +66,63 @@ SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);
SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);
/* SIMD Vector Scalar Product */
SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);
SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);
/* SIMD Vector Product */
SRSLTE_API void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
int16_t *r_im, int len);
SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
SRSLTE_API void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);
SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
/* SIMD Division */
SRSLTE_API void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
SRSLTE_API void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
SRSLTE_API void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len);
/* SIMD Dot product */
SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);
SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);
SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);
SRSLTE_API void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
/* SIMD Modulus functions */
SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);
SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);
SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
/* Other Functions */
SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len);
SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);
SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len);
SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
/* SIMD Find Max functions */
SRSLTE_API uint32_t srslte_vec_max_fi_simd(float *x, int len);
SRSLTE_API uint32_t srslte_vec_max_ci_simd(cf_t *x, int len);
#ifdef __cplusplus
}
#endif

View File

@ -36,17 +36,16 @@
#ifdef LV_HAVE_SSE
#include <immintrin.h>
#include "srslte/phy/utils/mat.h"
int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
#endif
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include "srslte/phy/utils/mat.h"
int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
#endif
#include "srslte/phy/utils/mat.h"
static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;

View File

@ -70,14 +70,12 @@ static void corr_all_sz_partial(cf_t z[SRSLTE_SSS_N], float s[SRSLTE_SSS_N][SRSL
static void extract_pair_sss(srslte_sss_synch_t *q, cf_t *input, cf_t *ce, cf_t y[2][SRSLTE_SSS_N]) {
cf_t input_fft[SRSLTE_SYMBOL_SZ_MAX];
srslte_dft_run_c(&q->dftp_input, input, input_fft);
if (ce) {
srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce, ce_mod,
&input_fft[q->fft_size/2-SRSLTE_SSS_N], z_real, z_imag,
2*SRSLTE_SSS_N);
srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce,
&input_fft[q->fft_size/2-SRSLTE_SSS_N], 2*SRSLTE_SSS_N);
}
for (int i = 0; i < SRSLTE_SSS_N; i++) {

View File

@ -44,4 +44,5 @@ add_test(algebra_2x2_zf_solver_test algebra_test -z)
add_test(algebra_2x2_mmse_solver_test algebra_test -m)
add_executable(vector_test vector_test.c)
target_link_libraries(vector_test srslte_phy)
add_test(vector_test vector_test)

View File

@ -89,6 +89,26 @@ float squared_error (cf_t a, cf_t b) {
return diff_re*diff_re + diff_im*diff_im;
}
TEST(srslte_vec_acc_ff,
MALLOC(float, x);
float z;
float gold = 0.0f;
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_F();
}
TEST_CALL(z = srslte_vec_acc_ff(x, block_size))
for (int i = 0; i < block_size; i++) {
gold += x[i];
}
mse += fabsf(gold - z) / fabsf(gold);
free(x);
)
TEST(srslte_vec_dot_prod_sss,
MALLOC(int16_t, x);
MALLOC(int16_t, y);
@ -314,6 +334,37 @@ TEST(srslte_vec_prod_ccc,
free(z);
)
TEST(srslte_vec_prod_ccc_split,
MALLOC(float, x_re);
MALLOC(float, x_im);
MALLOC(float, y_re);
MALLOC(float, y_im);
MALLOC(float, z_re);
MALLOC(float, z_im);
cf_t gold;
for (int i = 0; i < block_size; i++) {
x_re[i] = RANDOM_F();
x_im[i] = RANDOM_F();
y_re[i] = RANDOM_F();
y_im[i] = RANDOM_F();
}
TEST_CALL(srslte_vec_prod_ccc_split(x_re, x_im, y_re, y_im, z_re, z_im, block_size))
for (int i = 0; i < block_size; i++) {
gold = (x_re[i] + I * x_im[i]) * (y_re[i] + I * y_im[i]);
mse += cabsf(gold - (z_re[i] + I*z_im[i]));
}
free(x_re);
free(x_im);
free(y_re);
free(y_im);
free(z_re);
free(z_im);
)
TEST(srslte_vec_prod_conj_ccc,
MALLOC(cf_t, x);
MALLOC(cf_t, y);
@ -357,6 +408,27 @@ TEST(srslte_vec_sc_prod_ccc,
free(z);
)
TEST(srslte_vec_convert_fi,
MALLOC(float, x);
MALLOC(short, z);
float scale = 1000.0f;
short gold;
for (int i = 0; i < block_size; i++) {
x[i] = (float) RANDOM_F();
}
TEST_CALL(srslte_vec_convert_fi(x, z, scale, block_size))
for (int i = 0; i < block_size; i++) {
gold = (short) ((x[i] * scale));
mse += fabsf((float) gold - (float) z[i]);
}
free(x);
free(z);
)
TEST(srslte_vec_prod_fff,
MALLOC(float, x);
MALLOC(float, y);
@ -376,6 +448,30 @@ TEST(srslte_vec_prod_fff,
}
free(x);
free(y);
free(z);
)
TEST(srslte_vec_prod_cfc,
MALLOC(cf_t, x);
MALLOC(float, y);
MALLOC(cf_t, z);
cf_t gold;
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_CF();
y[i] = RANDOM_F();
}
TEST_CALL(srslte_vec_prod_cfc(x, y, z, block_size))
for (int i = 0; i < block_size; i++) {
gold = x[i] * y[i];
mse += cabsf(gold - z[i]);
}
free(x);
free(y);
free(z);
)
@ -461,66 +557,216 @@ TEST(srslte_vec_sc_prod_cfc,
free(z);
)
TEST(srslte_vec_div_ccc,
MALLOC(cf_t, x);
MALLOC(cf_t, y);
MALLOC(cf_t, z);
cf_t gold;
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_CF();
y[i] = RANDOM_CF();
}
TEST_CALL(srslte_vec_div_ccc(x, y, z, block_size))
for (int i = 0; i < block_size; i++) {
gold = x[i] / y[i];
mse += cabsf(gold - z[i]);
}
mse /= block_size;
free(x);
free(y);
free(z);
)
TEST(srslte_vec_div_cfc,
MALLOC(cf_t, x);
MALLOC(float, y);
MALLOC(cf_t, z);
cf_t gold;
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_CF();
y[i] = RANDOM_F();
}
TEST_CALL(srslte_vec_div_cfc(x, y, z, block_size))
for (int i = 0; i < block_size; i++) {
gold = x[i] / y[i];
mse += cabsf(gold - z[i])/cabsf(gold);
}
mse /= block_size;
free(x);
free(y);
free(z);
)
TEST(srslte_vec_div_fff,
MALLOC(float, x);
MALLOC(float, y);
MALLOC(float, z);
float gold;
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_F();
y[i] = RANDOM_F();
}
TEST_CALL(srslte_vec_div_fff(x, y, z, block_size))
for (int i = 0; i < block_size; i++) {
gold = x[i] / y[i];
mse += fabsf(gold - z[i]);
}
mse /= block_size;
free(x);
free(y);
free(z);
)
TEST(srslte_vec_max_fi,
MALLOC(float, x);
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_F();
}
uint32_t max_index = 0;
TEST_CALL(max_index = srslte_vec_max_fi(x, block_size);)
float gold_value = -INFINITY;
uint32_t gold_index = 0;
for (int i = 0; i < block_size; i++) {
if (gold_value < x[i]) {
gold_value = x[i];
gold_index = i;
}
}
mse = (gold_index != max_index) ? 1:0;
free(x);
)
TEST(srslte_vec_max_abs_ci,
MALLOC(cf_t, x);
for (int i = 0; i < block_size; i++) {
x[i] = RANDOM_CF();
}
uint32_t max_index = 0;
TEST_CALL(max_index = srslte_vec_max_abs_ci(x, block_size);)
float gold_value = -INFINITY;
uint32_t gold_index = 0;
for (int i = 0; i < block_size; i++) {
cf_t a = x[i];
float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
if (abs2 > gold_value) {
gold_value = abs2;
gold_index = (uint32_t)i;
}
}
mse = (gold_index != max_index) ? 1:0;
free(x);
)
int main(int argc, char **argv) {
char func_names[MAX_FUNCTIONS][32];
double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
uint32_t sizes[32];
uint32_t size_count = 0;
uint32_t func_count = 0;
bool passed[MAX_FUNCTIONS][MAX_BLOCKS];
bool all_passed = true;
for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
func_count = 0;
passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_acc_ff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_convert_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_ccc_split(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
passed[func_count][size_count] = test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_div_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_div_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_div_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
func_count++;
sizes[size_count] = block_size;
@ -546,10 +792,11 @@ int main(int argc, char **argv) {
for (int i = 0; i < func_count; i++) {
printf("%32s | ", func_names[i]);
for (int j = 0; j < size_count; j++) {
printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
all_passed &= passed[i][j];
}
printf(" |\n");
}
return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
}

View File

@ -48,18 +48,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
// Used in PRACH detector, AGC and chest_dl for noise averaging
float srslte_vec_acc_ff(float *x, uint32_t len) {
return srslte_vec_acc_ff_simd(x, len);
}
void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
@ -190,14 +179,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
// Chest UL
void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
  srslte_vec_sc_prod_ccc_simd(x, h, z, len);
}
// Used in turbo decoder
@ -217,14 +199,7 @@ void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
}
void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
srslte_vec_convert_fi_simd(x, z, scale, len);
}
void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
@ -234,13 +209,7 @@ void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
}
void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
srslte_vec_lut_sss_simd(x, lut, y, len);
}
void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
@ -280,7 +249,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
*/
void *srslte_vec_malloc(uint32_t size) {
void *ptr;
if (posix_memalign(&ptr,512,size)) {
if (posix_memalign(&ptr, SRSLTE_SIMD_BIT_ALIGN, size)) {
return NULL;
} else {
return ptr;
@ -292,7 +261,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
return realloc(ptr, new_size);
#else
void *new_ptr;
if (posix_memalign(&new_ptr,256,new_size)) {
if (posix_memalign(&new_ptr, SRSLTE_SIMD_BIT_ALIGN, new_size)) {
return NULL;
} else {
memcpy(new_ptr, ptr, old_size);
@ -415,6 +384,7 @@ void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) {
// Used in PSS
void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
/* This function is used in initialisation only, then no optimisation is required */
int i;
for (i=0;i<len;i++) {
y[i] = conjf(x[i]);
@ -423,10 +393,7 @@ void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
// Used in scrambling complex
void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
srslte_vec_prod_cfc_simd(x, y, z, len);
}
// Used in scrambling float
@ -444,6 +411,10 @@ void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
srslte_vec_prod_ccc_simd(x,y,z,len);
}
void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len) {
srslte_vec_prod_ccc_split_simd(x_re, x_im, y_re , y_im, z_re,z_im, len);
}
// PRACH, CHEST UL, etc.
void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
srslte_vec_prod_conj_ccc_simd(x,y,z,len);
@ -452,40 +423,17 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
// Used in SSS
/* Complex division is conjugate multiplication + real division */
void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
srslte_vec_div_ccc_simd(x, y, z, len);
}
/* Complex division by float z=x/y */
void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
srslte_vec_div_cfc_simd(x, y, z, len);
}
void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
srslte_vec_div_fff_simd(x, y, z, len);
}
// PSS. convolution
@ -554,30 +502,7 @@ void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
}
uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
return srslte_vec_max_fi_simd(x, len);
}
int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
@ -616,30 +541,7 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
// CP autocorr
uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
return srslte_vec_max_ci_simd(x, len);
}
void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {

View File

@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
/* No improvement with AVX */
void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
int i = 0;
#ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE /* CMAKE_BUILD_TYPE is not a preprocessor symbol; the old guard always evaluated false */
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 7; i += 8) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x;
}
}
} else {
for (; i < len - 7; i += 8) {
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x;
}
}
}
#endif
#endif
for (; i < len; i++) {
y[lut[i]] = x[i];
}
}
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len) {
  int i = 0;

#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
  simd_f_t s = srslte_simd_f_set1(scale);
  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
      simd_f_t a = srslte_simd_f_load(&x[i]);
      simd_f_t b = srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]);

      simd_f_t sa = srslte_simd_f_mul(a, s);
      simd_f_t sb = srslte_simd_f_mul(b, s);

      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);

      srslte_simd_s_store(&z[i], i16);
    }
  } else {
    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
      simd_f_t a = srslte_simd_f_loadu(&x[i]);
      simd_f_t b = srslte_simd_f_loadu(&x[i + SRSLTE_SIMD_F_SIZE]);

      simd_f_t sa = srslte_simd_f_mul(a, s);
      simd_f_t sb = srslte_simd_f_mul(b, s);

      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);

      srslte_simd_s_storeu(&z[i], i16);
    }
  }
#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */

  for (; i < len; i++) {
    z[i] = (int16_t) (x[i] * scale);
  }
}
float srslte_vec_acc_ff_simd(float *x, int len) {
  int i = 0;
  float acc_sum = 0.0f;

#if SRSLTE_SIMD_F_SIZE
  simd_f_t simd_sum = srslte_simd_f_zero();

  if (SRSLTE_IS_ALIGNED(x)) {
    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
      simd_f_t a = srslte_simd_f_load(&x[i]);

      simd_sum = srslte_simd_f_add(simd_sum, a);
    }
  } else {
    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
      simd_f_t a = srslte_simd_f_loadu(&x[i]);

      simd_sum = srslte_simd_f_add(simd_sum, a);
    }
  }

  __attribute__((aligned(SRSLTE_SIMD_F_SIZE*4))) float sum[SRSLTE_SIMD_F_SIZE];
  srslte_simd_f_store(sum, simd_sum);
  for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
    acc_sum += sum[k];
  }
#endif

  for (; i < len; i++) {
    acc_sum += x[i];
  }
  return acc_sum;
}
cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
@ -570,6 +540,34 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len)
return result;
}
void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
int i = 0;
#if SRSLTE_SIMD_CF_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_f_t s = srslte_simd_f_load(&y[i]);
simd_cf_t a = srslte_simd_cfi_load(&x[i]);
simd_cf_t r = srslte_simd_cf_mul(a, s);
srslte_simd_cfi_store(&z[i], r);
}
} else {
    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
      simd_f_t s = srslte_simd_f_loadu(&y[i]);
      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]); /* unaligned branch must use the unaligned load */
simd_cf_t r = srslte_simd_cf_mul(a, s);
srslte_simd_cfi_storeu(&z[i], r);
}
}
#endif
for (; i<len; i++) {
z[i] = x[i] * y[i];
}
}
void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
int i = 0;
@ -630,17 +628,29 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
}
}
void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
int i = 0;
#if SRSLTE_SIMD_F_SIZE
  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
      simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
      simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);

      simd_cf_t r = srslte_simd_cf_prod(a, b);

      srslte_simd_cf_store(&r_re[i], &r_im[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t a = srslte_simd_cf_loadu(&a_re[i], &a_im[i]);
simd_cf_t b = srslte_simd_cf_loadu(&b_re[i], &b_im[i]);
simd_cf_t r = srslte_simd_cf_prod(a, b);
srslte_simd_cf_storeu(&r_re[i], &r_im[i], r);
}
}
#endif
@ -655,13 +665,25 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i
int i = 0;
#if SRSLTE_SIMD_C16_SIZE
  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
      simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
      simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);

      simd_c16_t r = srslte_simd_c16_prod(a, b);

      srslte_simd_c16_store(&r_re[i], &r_im[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
simd_c16_t a = srslte_simd_c16_loadu(&a_re[i], &a_im[i]);
simd_c16_t b = srslte_simd_c16_loadu(&b_re[i], &b_im[i]);
simd_c16_t r = srslte_simd_c16_prod(a, b);
srslte_simd_c16_storeu(&r_re[i], &r_im[i], r);
}
}
#endif
@ -701,6 +723,103 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
}
}
void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
int i = 0;
#if SRSLTE_SIMD_CF_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t a = srslte_simd_cfi_load(&x[i]);
simd_cf_t b = srslte_simd_cfi_load(&y[i]);
simd_cf_t rcpb = srslte_simd_cf_rcp(b);
simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
srslte_simd_cfi_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
simd_cf_t rcpb = srslte_simd_cf_rcp(b);
simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
srslte_simd_cfi_storeu(&z[i], r);
}
}
#endif
for (; i < len; i++) {
z[i] = x[i] / y[i];
}
}
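Since srslte_simd_cf_rcp is based on the hardware reciprocal estimate, the SIMD lanes produce an approximate quotient (relative error on the order of 2^-12) while the scalar tail divides exactly; the updated vector_test averages the error over the block to tolerate this. A caller needing full precision could instead divide by |y|^2 with a true division, e.g. (scalar sketch, not part of this commit):

/* Exact complex division via the conjugate identity (hypothetical variant):
 *   z[i] = x[i] * conj(y[i]) / |y[i]|^2 */
z[i] = x[i] * conjf(y[i]) / (crealf(y[i]) * crealf(y[i]) + cimagf(y[i]) * cimagf(y[i]));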
void srslte_vec_div_cfc_simd(cf_t *x,float *y, cf_t *z, int len) {
int i = 0;
#if SRSLTE_SIMD_CF_SIZE && SRSLTE_SIMD_CF_SIZE == SRSLTE_SIMD_F_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t a = srslte_simd_cfi_load(&x[i]);
simd_f_t b = srslte_simd_f_load(&y[i]);
simd_f_t rcpb = srslte_simd_f_rcp(b);
simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
srslte_simd_cfi_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
simd_f_t b = srslte_simd_f_loadu(&y[i]);
simd_f_t rcpb = srslte_simd_f_rcp(b);
simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
srslte_simd_cfi_storeu(&z[i], r);
}
}
#endif
for (; i < len; i++) {
z[i] = x[i] / y[i];
}
}
void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
int i = 0;
#if SRSLTE_SIMD_F_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
simd_f_t a = srslte_simd_f_load(&x[i]);
simd_f_t b = srslte_simd_f_load(&y[i]);
simd_f_t rcpb = srslte_simd_f_rcp(b);
simd_f_t r = srslte_simd_f_mul(a, rcpb);
srslte_simd_f_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
simd_f_t a = srslte_simd_f_loadu(&x[i]);
simd_f_t b = srslte_simd_f_loadu(&y[i]);
simd_f_t rcpb = srslte_simd_f_rcp(b);
simd_f_t r = srslte_simd_f_mul(a, rcpb);
srslte_simd_f_storeu(&z[i], r);
}
}
#endif
for (; i < len; i++) {
z[i] = x[i] / y[i];
}
}
void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
int i = 0;
@ -895,3 +1014,137 @@ void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
dst[i] = src[i];
}
}
uint32_t srslte_vec_max_fi_simd(float *x, int len) {
int i = 0;
float max_value = -INFINITY;
uint32_t max_index = 0;
#if SRSLTE_SIMD_I_SIZE
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
if (SRSLTE_IS_ALIGNED(x)) {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t a = srslte_simd_f_load(&x[i]);
simd_i_t res = srslte_simd_f_max(a, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
} else {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t a = srslte_simd_f_loadu(&x[i]);
simd_i_t res = srslte_simd_f_max(a, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
}
srslte_simd_i_store(indexes_buffer, simd_max_indexes);
srslte_simd_f_store(values_buffer, simd_max_values);
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
if (values_buffer[k] > max_value) {
max_value = values_buffer[k];
max_index = (uint32_t) indexes_buffer[k];
}
}
#endif /* SRSLTE_SIMD_I_SIZE */
for (; i < len; i++) {
if (x[i] > max_value) {
max_value = x[i];
max_index = (uint32_t)i;
}
}
return max_index;
}
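A usage sketch with hypothetical data; the SIMD path and the scalar tail return the same index:

float v[16] = {0.0f};                          /* hypothetical input buffer */
v[5] = 10.0f;
uint32_t idx = srslte_vec_max_fi_simd(v, 16);  /* idx == 5 */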
uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
int i = 0;
float max_value = -INFINITY;
uint32_t max_index = 0;
#if SRSLTE_SIMD_I_SIZE
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
__attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
if (SRSLTE_IS_ALIGNED(x)) {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
} else {
for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
}
}
srslte_simd_i_store(indexes_buffer, simd_max_indexes);
srslte_simd_f_store(values_buffer, simd_max_values);
for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
if (values_buffer[k] > max_value) {
max_value = values_buffer[k];
max_index = (uint32_t) indexes_buffer[k];
}
}
#endif /* SRSLTE_SIMD_I_SIZE */
for (; i < len; i++) {
cf_t a = x[i];
float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
if (abs2 > max_value) {
max_value = abs2;
max_index = (uint32_t)i;
}
}
return max_index;
}