diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 416319dcc..ba71b0aac 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -95,6 +95,27 @@ typedef _Complex float cf_t;
       A, _mm256_moveldup_ps(B), _mm256_fmsubadd_ps(_mm256_shuffle_ps(A, A, 0xB1), _mm256_movehdup_ps(B), C))
 #endif /* LV_HAVE_FMA */
 
+/*
+ * SIMD Vector bit alignment
+ */
+#ifdef LV_HAVE_AVX512
+#define SRSLTE_SIMD_BIT_ALIGN 512
+#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x3F) == 0)
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX
+#define SRSLTE_SIMD_BIT_ALIGN 256
+#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x1F) == 0)
+#else /* LV_HAVE_AVX */
+#ifdef LV_HAVE_SSE
+#define SRSLTE_SIMD_BIT_ALIGN 128
+#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x0F) == 0)
+#else /* LV_HAVE_SSE */
+#define SRSLTE_SIMD_BIT_ALIGN 64
+#define SRSLTE_IS_ALIGNED(PTR) (1)
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX */
+#endif /* LV_HAVE_AVX512 */
+
 /* Memory Sizes for Single Floating Point and fixed point */
 
 #ifdef LV_HAVE_AVX512
@@ -546,6 +567,19 @@ static inline simd_f_t srslte_simd_f_abs(simd_f_t a)
 #endif /* LV_HAVE_AVX512 */
 }
 
+static inline void srslte_simd_f_fprintf(FILE* stream, simd_f_t a)
+{
+  float x[SRSLTE_SIMD_F_SIZE];
+
+  srslte_simd_f_storeu(x, a);
+
+  fprintf(stream, "[");
+  for (int i = 0; i < SRSLTE_SIMD_F_SIZE; i++) {
+    fprintf(stream, "%+2.5f, ", x[i]);
+  }
+  fprintf(stream, "];\n");
+}
+
 #endif /* SRSLTE_SIMD_F_SIZE */
 
 #if SRSLTE_SIMD_CF_SIZE
@@ -1110,6 +1144,19 @@ static inline simd_cf_t srslte_simd_cf_zero(void)
   return ret;
 }
 
+static inline void srslte_simd_cf_fprintf(FILE* stream, simd_cf_t a)
+{
+  cf_t x[SRSLTE_SIMD_CF_SIZE];
+
+  srslte_simd_cfi_storeu(x, a);
+
+  fprintf(stream, "[");
+  for (int i = 0; i < SRSLTE_SIMD_CF_SIZE; i++) {
+    fprintf(stream, "%+2.5f%+2.5fi, ", __real__ x[i], __imag__ x[i]);
+  }
+  fprintf(stream, "];\n");
+}
+
 #endif /* SRSLTE_SIMD_CF_SIZE */
 
 #if SRSLTE_SIMD_I_SIZE
@@ -1267,6 +1314,56 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b)
 #endif /* LV_HAVE_AVX512 */
 }
 
+static inline simd_sel_t srslte_simd_f_min(simd_f_t a, simd_f_t b)
+{
+#ifdef LV_HAVE_AVX512
+  return _mm512_cmp_ps_mask(a, b, _CMP_LT_OS);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_cmp_ps(a, b, _CMP_LT_OS);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return (simd_sel_t)_mm_cmplt_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return (simd_sel_t)vcltq_f32(a, b);
+#endif /* HAVE_NEON */
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_f_t srslte_simd_f_select(simd_f_t a, simd_f_t b, simd_sel_t selector)
+{
+#ifdef LV_HAVE_AVX512
+  return _mm512_mask_blend_ps(selector, (__m512)a, (__m512)b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_blendv_ps(a, b, selector);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_blendv_ps(a, b, selector);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
+  simd_f_t ret;
+  float*   a_ptr = (float*)&a;
+  float*   b_ptr = (float*)&b;
+  int*     sel   = (int*)&selector;
+  float*   c_ptr = (float*)&ret;
+  for (int i = 0; i < 4; i++) {
+    if (sel[i] == -1) {
+      c_ptr[i] = b_ptr[i];
+    } else {
+      c_ptr[i] = a_ptr[i];
+    }
+  }
+  return ret;
+#endif /* HAVE_NEON */
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
 static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t selector)
 {
 #ifdef LV_HAVE_AVX512
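The new srslte_simd_f_min()/srslte_simd_f_select() pair composes into a vector clamp, which is how precoding.c uses it further down to floor the determinant and the per-lane SINRs. A minimal sketch of the idiom, assuming a build where SRSLTE_SIMD_F_SIZE is non-zero (the helper name is illustrative, not part of the library):

#include "srslte/phy/utils/simd.h"

#if SRSLTE_SIMD_F_SIZE
/* Floor every lane of `a` at `min_val`: lanes where a < min_val take min_val. */
static inline simd_f_t simd_f_floor_example(simd_f_t a, float min_val)
{
  simd_f_t   floor_v = srslte_simd_f_set1(min_val);
  simd_sel_t is_less = srslte_simd_f_min(a, floor_v); /* selector true where a < min_val */
  return srslte_simd_f_select(a, floor_v, is_less);   /* select() takes b where true */
}
#endif /* SRSLTE_SIMD_F_SIZE */

Swapping the comparison operands gives the matching upper bound with the same two primitives.

diff --git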
a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 99f3337cf..1f1ead93f 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -30,24 +30,6 @@ extern "C" {
 #include
 #include
 
-#ifdef LV_HAVE_AVX512
-#define SRSLTE_SIMD_BIT_ALIGN 512
-#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x3F) == 0)
-#else /* LV_HAVE_AVX512 */
-#ifdef LV_HAVE_AVX
-#define SRSLTE_SIMD_BIT_ALIGN 256
-#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x1F) == 0)
-#else /* LV_HAVE_AVX */
-#ifdef LV_HAVE_SSE
-#define SRSLTE_SIMD_BIT_ALIGN 128
-#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR)&0x0F) == 0)
-#else /* LV_HAVE_SSE */
-#define SRSLTE_SIMD_BIT_ALIGN 64
-#define SRSLTE_IS_ALIGNED(PTR) (1)
-#endif /* LV_HAVE_SSE */
-#endif /* LV_HAVE_AVX */
-#endif /* LV_HAVE_AVX512 */
-
 /*SIMD Logical operations*/
 
 SRSLTE_API void srslte_vec_xor_bbb_simd(const int8_t* x, const int8_t* y, int8_t* z, int len);
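With SRSLTE_IS_ALIGNED() now living next to SRSLTE_SIMD_BIT_ALIGN in simd.h, any SIMD user can pick aligned or unaligned loads with a single pointer test, the same pattern vector_simd.c follows. A small sketch under that assumption (function name is illustrative; only documented srslte_simd_f_* calls are used):

#include "srslte/phy/utils/simd.h"

#if SRSLTE_SIMD_F_SIZE
/* Sum `len` floats, using aligned loads only when the pointer allows it. */
static float sum_f_example(const float* x, int len)
{
  simd_f_t s = srslte_simd_f_set1(0.0f);
  int      i = 0;

  if (SRSLTE_IS_ALIGNED(x)) {
    for (; i + SRSLTE_SIMD_F_SIZE <= len; i += SRSLTE_SIMD_F_SIZE) {
      s = srslte_simd_f_add(s, srslte_simd_f_load(&x[i]));
    }
  } else {
    for (; i + SRSLTE_SIMD_F_SIZE <= len; i += SRSLTE_SIMD_F_SIZE) {
      s = srslte_simd_f_add(s, srslte_simd_f_loadu(&x[i]));
    }
  }

  __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) float v[SRSLTE_SIMD_F_SIZE];
  srslte_simd_f_store(v, s);

  float acc = 0.0f;
  for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
    acc += v[k];
  }
  for (; i < len; i++) {
    acc += x[i]; /* scalar tail */
  }
  return acc;
}
#endif /* SRSLTE_SIMD_F_SIZE */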
diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c
index 5c15eedb8..d72801cf6 100644
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@@ -2357,232 +2357,127 @@ int srslte_precoding_pmi_select_1l_gen(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_
   return i;
 }
 
-#ifdef LV_HAVE_SSE
+#ifdef SRSLTE_SIMD_CF_SIZE
 
 /* PMI Select for 1 layer */
-int srslte_precoding_pmi_select_1l_sse(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                       uint32_t nof_symbols,
-                                       float noise_estimate,
-                                       uint32_t* pmi,
-                                       float sinr_list[SRSLTE_MAX_CODEBOOKS])
+int srslte_precoding_pmi_select_1l_simd(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                        uint32_t nof_symbols,
+                                        float noise_estimate,
+                                        uint32_t* pmi,
+                                        float sinr_list[SRSLTE_MAX_CODEBOOKS])
 {
-  float max_sinr = 0.0;
-  uint32_t i, count;
-  __m128 sse_norm = _mm_set1_ps(0.5f);
+  float    max_sinr    = 0.0;
+  simd_f_t simd_f_norm = srslte_simd_f_set1(0.5f);
 
-  for (i = 0; i < 4; i++) {
-    sinr_list[i] = 0;
-    count = 0;
+  for (uint32_t i = 0; i < 4; i++) {
+    float sinr_acc = 0;
+    float count    = 0;
 
-    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * 2 + 1; j += PMI_SEL_PRECISION * 2) {
-      /* 0. Load channel matrix */
-      __m128 h00 = _mm_set_ps(crealf(h[0][0][j]),
-                              cimagf(h[0][0][j]),
-                              crealf(h[0][0][j + PMI_SEL_PRECISION]),
-                              cimagf(h[0][0][j + PMI_SEL_PRECISION]));
-      __m128 h01 = _mm_set_ps(crealf(h[1][0][j]),
-                              cimagf(h[1][0][j]),
-                              crealf(h[1][0][j + PMI_SEL_PRECISION]),
-                              cimagf(h[1][0][j + PMI_SEL_PRECISION]));
-      __m128 h10 = _mm_set_ps(crealf(h[0][1][j]),
-                              cimagf(h[0][1][j]),
-                              crealf(h[0][1][j + PMI_SEL_PRECISION]),
-                              cimagf(h[0][1][j + PMI_SEL_PRECISION]));
-      __m128 h11 = _mm_set_ps(crealf(h[1][1][j]),
-                              cimagf(h[1][1][j]),
-                              crealf(h[1][1][j + PMI_SEL_PRECISION]),
-                              cimagf(h[1][1][j + PMI_SEL_PRECISION]));
+    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * SRSLTE_SIMD_CF_SIZE + 1;
+         j += PMI_SEL_PRECISION * SRSLTE_SIMD_CF_SIZE) {
+      // 0. Load channel matrix
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h00_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h01_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h10_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h11_v[SRSLTE_SIMD_CF_SIZE];
+
+      for (uint32_t k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
+        h00_v[k] = h[0][0][j + PMI_SEL_PRECISION * k];
+        h01_v[k] = h[1][0][j + PMI_SEL_PRECISION * k];
+        h10_v[k] = h[0][1][j + PMI_SEL_PRECISION * k];
+        h11_v[k] = h[1][1][j + PMI_SEL_PRECISION * k];
+      }
+
+      simd_cf_t h00 = srslte_simd_cfi_load(h00_v);
+      simd_cf_t h01 = srslte_simd_cfi_load(h01_v);
+      simd_cf_t h10 = srslte_simd_cfi_load(h10_v);
+      simd_cf_t h11 = srslte_simd_cfi_load(h11_v);
 
       /* 1. B = W'* H' */
-      __m128 a0, a1;
+      simd_cf_t a0, a1;
       switch (i) {
         case 0:
-          a0 = _mm_add_ps(_MM_CONJ_PS(h00), _MM_CONJ_PS(h01));
-          a1 = _mm_add_ps(_MM_CONJ_PS(h10), _MM_CONJ_PS(h11));
+          a0 = srslte_simd_cf_add(srslte_simd_cf_conj(h00), srslte_simd_cf_conj(h01));
+          a1 = srslte_simd_cf_add(srslte_simd_cf_conj(h10), srslte_simd_cf_conj(h11));
           break;
         case 1:
-          a0 = _mm_sub_ps(_MM_CONJ_PS(h00), _MM_CONJ_PS(h01));
-          a1 = _mm_sub_ps(_MM_CONJ_PS(h10), _MM_CONJ_PS(h11));
+          a0 = srslte_simd_cf_sub(srslte_simd_cf_conj(h00), srslte_simd_cf_conj(h01));
+          a1 = srslte_simd_cf_sub(srslte_simd_cf_conj(h10), srslte_simd_cf_conj(h11));
           break;
         case 2:
-          a0 = _mm_add_ps(_MM_CONJ_PS(h00), _MM_MULJ_PS(_MM_CONJ_PS(h01)));
-          a1 = _mm_add_ps(_MM_CONJ_PS(h10), _MM_MULJ_PS(_MM_CONJ_PS(h11)));
+          a0 = srslte_simd_cf_sub(srslte_simd_cf_conj(h00), srslte_simd_cf_mulj(srslte_simd_cf_conj(h01)));
+          a1 = srslte_simd_cf_sub(srslte_simd_cf_conj(h10), srslte_simd_cf_mulj(srslte_simd_cf_conj(h11)));
           break;
-        case 3:
-          a0 = _mm_sub_ps(_MM_CONJ_PS(h00), _MM_MULJ_PS(_MM_CONJ_PS(h01)));
-          a1 = _mm_sub_ps(_MM_CONJ_PS(h10), _MM_MULJ_PS(_MM_CONJ_PS(h11)));
+        default:
+          a0 = srslte_simd_cf_add(srslte_simd_cf_conj(h00), srslte_simd_cf_mulj(srslte_simd_cf_conj(h01)));
+          a1 = srslte_simd_cf_add(srslte_simd_cf_conj(h10), srslte_simd_cf_mulj(srslte_simd_cf_conj(h11)));
           break;
       }
 
       /* 2. B = W' * H' * H = A * H */
-      __m128 b0 = _mm_add_ps(_MM_PROD_PS(a0, h00), _MM_PROD_PS(a1, h10));
-      __m128 b1 = _mm_add_ps(_MM_PROD_PS(a0, h01), _MM_PROD_PS(a1, h11));
+      simd_cf_t b0 = srslte_simd_cf_add(srslte_simd_cf_prod(a0, h00), srslte_simd_cf_prod(a1, h10));
+      simd_cf_t b1 = srslte_simd_cf_add(srslte_simd_cf_prod(a0, h01), srslte_simd_cf_prod(a1, h11));
 
       /* 3. C = W' * H' * H * W' = B * W */
-      __m128 c;
+      simd_cf_t c;
       switch (i) {
         case 0:
-          c = _mm_add_ps(b0, b1);
+          c = srslte_simd_cf_add(b0, b1);
          break;
         case 1:
-          c = _mm_sub_ps(b0, b1);
+          c = srslte_simd_cf_sub(b0, b1);
           break;
         case 2:
-          c = _mm_sub_ps(b0, _MM_MULJ_PS(b1));
+          c = srslte_simd_cf_add(b0, srslte_simd_cf_mulj(b1));
           break;
         case 3:
-          c = _mm_add_ps(b0, _MM_MULJ_PS(b1));
+          c = srslte_simd_cf_sub(b0, srslte_simd_cf_mulj(b1));
           break;
         default:
          return SRSLTE_ERROR;
       }
-      c = _mm_mul_ps(c, sse_norm);
 
-      /* Add for averaging */
-      __attribute__((aligned(128))) float gamma[4];
-      _mm_store_ps(gamma, c);
-      sinr_list[i] += gamma[0] + gamma[2];
+      simd_f_t gamma = srslte_simd_f_mul(srslte_simd_cf_re(c), simd_f_norm);
 
-      count += 2;
+      // Horizontal accumulation
+      for (int k = 1; k < SRSLTE_SIMD_F_SIZE; k *= 2) {
+        gamma = srslte_simd_f_hadd(gamma, gamma);
+      }
+
+      // Temporary store for the accumulated values
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) float v[SRSLTE_SIMD_F_SIZE];
+      srslte_simd_f_store(v, gamma);
+
+      // Average the block and accumulate the SINR
+      sinr_acc += (v[0] / SRSLTE_SIMD_CF_SIZE);
+
+      // Increase loop counter
+      count += 1;
     }
 
-    /* Divide average by noise */
-    sinr_list[i] /= noise_estimate * count;
+    // Average accumulated SINR
+    if (count) {
+      sinr_acc /= (noise_estimate * count);
+    } else {
+      sinr_acc = 1e+9f;
+    }
 
-    if (sinr_list[i] > max_sinr) {
-      max_sinr = sinr_list[i];
+    // Save SINR if available
+    if (sinr_list) {
+      sinr_list[i] = sinr_acc;
+    }
+
+    // Select maximum SINR Codebook
+    if (pmi && sinr_acc > max_sinr) {
+      max_sinr = sinr_acc;
       *pmi = i;
     }
   }
 
-  return i;
+  return 4;
 }
 
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-
-/* PMI Select for 1 layer */
-int srslte_precoding_pmi_select_1l_avx(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                       uint32_t nof_symbols,
-                                       float noise_estimate,
-                                       uint32_t* pmi,
-                                       float sinr_list[SRSLTE_MAX_CODEBOOKS])
-{
-  float max_sinr = 0.0;
-  uint32_t i, count;
-  __m256 avx_norm = _mm256_set1_ps(0.5f);
-
-  for (i = 0; i < 4; i++) {
-    sinr_list[i] = 0;
-    count = 0;
-
-    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * 4 + 1; j += PMI_SEL_PRECISION * 4) {
-      /* 0. Load channel matrix */
-      __m256 h00 = _mm256_setr_ps(crealf(h[0][0][j]),
-                                  cimagf(h[0][0][j]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION * 3]));
-      __m256 h01 = _mm256_setr_ps(crealf(h[1][0][j]),
-                                  cimagf(h[1][0][j]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION * 3]));
-      __m256 h10 = _mm256_setr_ps(crealf(h[0][1][j]),
-                                  cimagf(h[0][1][j]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION * 3]));
-      __m256 h11 = _mm256_setr_ps(crealf(h[1][1][j]),
-                                  cimagf(h[1][1][j]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION * 3]));
-
-      /* 1. B = W'* H' */
-      __m256 a0, a1;
-      switch (i) {
-        case 0:
-          a0 = _mm256_add_ps(_MM256_CONJ_PS(h00), _MM256_CONJ_PS(h01));
-          a1 = _mm256_add_ps(_MM256_CONJ_PS(h10), _MM256_CONJ_PS(h11));
-          break;
-        case 1:
-          a0 = _mm256_sub_ps(_MM256_CONJ_PS(h00), _MM256_CONJ_PS(h01));
-          a1 = _mm256_sub_ps(_MM256_CONJ_PS(h10), _MM256_CONJ_PS(h11));
-          break;
-        case 2:
-          a0 = _mm256_sub_ps(_MM256_CONJ_PS(h00), _MM256_MULJ_PS(_MM256_CONJ_PS(h01)));
-          a1 = _mm256_sub_ps(_MM256_CONJ_PS(h10), _MM256_MULJ_PS(_MM256_CONJ_PS(h11)));
-          break;
-        default:
-          a0 = _mm256_add_ps(_MM256_CONJ_PS(h00), _MM256_MULJ_PS(_MM256_CONJ_PS(h01)));
-          a1 = _mm256_add_ps(_MM256_CONJ_PS(h10), _MM256_MULJ_PS(_MM256_CONJ_PS(h11)));
-          break;
-      }
-
-      /* 2. B = W' * H' * H = A * H */
-#ifdef LV_HAVE_FMA
-      __m256 b0 = _MM256_PROD_ADD_PS(a0, h00, _MM256_PROD_PS(a1, h10));
-      __m256 b1 = _MM256_PROD_ADD_PS(a0, h01, _MM256_PROD_PS(a1, h11));
-#else
-      __m256 b0 = _mm256_add_ps(_MM256_PROD_PS(a0, h00), _MM256_PROD_PS(a1, h10));
-      __m256 b1 = _mm256_add_ps(_MM256_PROD_PS(a0, h01), _MM256_PROD_PS(a1, h11));
-#endif /* LV_HAVE_FMA */
-
-      /* 3. C = W' * H' * H * W' = B * W */
-      __m256 c;
-      switch (i) {
-        case 0:
-          c = _mm256_add_ps(b0, b1);
-          break;
-        case 1:
-          c = _mm256_sub_ps(b0, b1);
-          break;
-        case 2:
-          c = _mm256_add_ps(b0, _MM256_MULJ_PS(b1));
-          break;
-        case 3:
-          c = _mm256_sub_ps(b0, _MM256_MULJ_PS(b1));
-          break;
-        default:
-          return SRSLTE_ERROR;
-      }
-      c = _mm256_mul_ps(c, avx_norm);
-
-      /* Add for averaging */
-      __attribute__((aligned(256))) float gamma[8];
-      _mm256_store_ps(gamma, c);
-      sinr_list[i] += gamma[0] + gamma[2] + gamma[4] + gamma[6];
-
-      count += 4;
-    }
-
-    /* Divide average by noise */
-    sinr_list[i] /= noise_estimate * count;
-
-    if (sinr_list[i] > max_sinr) {
-      max_sinr = sinr_list[i];
-      *pmi = i;
-    }
-  }
-
-  return i;
-}
-
-#endif /* LV_HAVE_AVX */
+#endif /* SRSLTE_SIMD_CF_SIZE */
 
 int srslte_precoding_pmi_select_1l(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                    uint32_t nof_symbols,
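Both SIMD kernels in this file collapse a full register of per-symbol gammas with repeated srslte_simd_f_hadd(gamma, gamma) calls: each pass halves the number of distinct partial sums, so after log2(SRSLTE_SIMD_F_SIZE) passes the first lane carries the total. The exact lane ordering of a horizontal add differs per ISA, but the total landing in lane 0 is what the store relies on. A scalar model of the reduction, assuming a power-of-two width (N stands in for SRSLTE_SIMD_F_SIZE; the function name is illustrative):

#include <stdio.h>

#define N 8 /* stand-in for SRSLTE_SIMD_F_SIZE, always a power of two */

/* Pairwise-add reduction: after log2(N) rounds, v[0] holds the full sum. */
static float hadd_reduce(float v[N])
{
  for (int width = N; width > 1; width /= 2) {
    for (int i = 0; i < width / 2; i++) {
      v[i] = v[2 * i] + v[2 * i + 1];
    }
  }
  return v[0];
}

int main(void)
{
  float v[N] = {1, 2, 3, 4, 5, 6, 7, 8};
  printf("sum = %f\n", hadd_reduce(v)); /* prints 36.000000 */
  return 0;
}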
@@ -2591,15 +2486,11 @@ int srslte_precoding_pmi_select_1l(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORT
                                    float sinr_list[SRSLTE_MAX_CODEBOOKS])
 {
   int ret;
-#ifdef LV_HAVE_AVX
-  ret = srslte_precoding_pmi_select_1l_avx(h, nof_symbols, noise_estimate, pmi, sinr_list);
-#else
-#ifdef LV_HAVE_SSE
-  ret = srslte_precoding_pmi_select_1l_sse(h, nof_symbols, noise_estimate, pmi, sinr_list);
+#ifdef SRSLTE_SIMD_CF_SIZE
+  ret = srslte_precoding_pmi_select_1l_simd(h, nof_symbols, noise_estimate, pmi, sinr_list);
 #else
   ret = srslte_precoding_pmi_select_1l_gen(h, nof_symbols, noise_estimate, pmi, sinr_list);
-#endif
-#endif
+#endif /* SRSLTE_SIMD_CF_SIZE */
 
   INFO("Precoder PMI Select for 1 layer SINR=[%.1fdB; %.1fdB; %.1fdB; %.1fdB] PMI=%d\n",
        srslte_convert_power_to_dB(sinr_list[0]),
        srslte_convert_power_to_dB(sinr_list[1]),
@@ -2713,285 +2604,161 @@ int srslte_precoding_pmi_select_2l_gen(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_
   return i;
 }
 
-#ifdef LV_HAVE_SSE
+#ifdef SRSLTE_SIMD_CF_SIZE
 
-int srslte_precoding_pmi_select_2l_sse(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                       uint32_t nof_symbols,
-                                       float noise_estimate,
-                                       uint32_t* pmi,
-                                       float sinr_list[SRSLTE_MAX_CODEBOOKS])
+int srslte_precoding_pmi_select_2l_simd(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                        int nof_symbols,
+                                        float noise_estimate,
+                                        uint32_t* pmi,
+                                        float sinr_list[SRSLTE_MAX_CODEBOOKS])
 {
+  // SIMD Constants
+  const simd_cf_t simd_cf_noise_estimate = srslte_simd_cf_set1(noise_estimate);
+  const simd_f_t  simd_f_noise_estimate  = srslte_simd_f_set1(noise_estimate);
+  const simd_f_t  simd_f_norm            = srslte_simd_f_set1(0.25f);
+  const simd_f_t  simd_f_ones            = srslte_simd_f_set1(1.0f);
+  const simd_f_t  simd_f_det_min         = srslte_simd_f_set1(1e-10f);
+  const simd_f_t  simd_f_gamma_min       = srslte_simd_f_set1(1e-9f);
 
-  float max_sinr = 0.0;
-  uint32_t i, count;
+  float max_sinr = 0.0f;
 
-  __m128 sse_noise_estimate = _mm_setr_ps(noise_estimate, 0.0f, noise_estimate, 0.0f);
-  __m128 sse_norm = _mm_set1_ps(0.25f);
-  __m128 sse_ones = _mm_set1_ps(1.0f);
+  for (uint32_t i = 0; i < 2; i++) {
+    float count    = 0.0f;
+    float sinr_acc = 0.0f;
 
-  for (i = 0; i < 2; i++) {
-    sinr_list[i] = 0;
-    count = 0;
+    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * SRSLTE_SIMD_CF_SIZE + 1;
+         j += PMI_SEL_PRECISION * SRSLTE_SIMD_CF_SIZE) {
+      // 0. Load channel matrix
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h00_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h01_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h10_v[SRSLTE_SIMD_CF_SIZE];
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) cf_t h11_v[SRSLTE_SIMD_CF_SIZE];
 
-    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * 2 + 1; j += PMI_SEL_PRECISION * 2) {
-      /* 0. Load channel matrix */
-      __m128 h00 = _mm_setr_ps(crealf(h[0][0][j]),
-                               cimagf(h[0][0][j]),
-                               crealf(h[0][0][j + PMI_SEL_PRECISION]),
-                               cimagf(h[0][0][j + PMI_SEL_PRECISION]));
-      __m128 h01 = _mm_setr_ps(crealf(h[1][0][j]),
-                               cimagf(h[1][0][j]),
-                               crealf(h[1][0][j + PMI_SEL_PRECISION]),
-                               cimagf(h[1][0][j + PMI_SEL_PRECISION]));
-      __m128 h10 = _mm_setr_ps(crealf(h[0][1][j]),
-                               cimagf(h[0][1][j]),
-                               crealf(h[0][1][j + PMI_SEL_PRECISION]),
-                               cimagf(h[0][1][j + PMI_SEL_PRECISION]));
-      __m128 h11 = _mm_setr_ps(crealf(h[1][1][j]),
-                               cimagf(h[1][1][j]),
-                               crealf(h[1][1][j + PMI_SEL_PRECISION]),
-                               cimagf(h[1][1][j + PMI_SEL_PRECISION]));
+      for (uint32_t k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
+        h00_v[k] = h[0][0][j + PMI_SEL_PRECISION * k];
+        h01_v[k] = h[1][0][j + PMI_SEL_PRECISION * k];
+        h10_v[k] = h[0][1][j + PMI_SEL_PRECISION * k];
+        h11_v[k] = h[1][1][j + PMI_SEL_PRECISION * k];
+      }
 
-      /* 1. B = W'* H' */
-      __m128 a00, a01, a10, a11;
+      simd_cf_t h00 = srslte_simd_cfi_load(h00_v);
+      simd_cf_t h01 = srslte_simd_cfi_load(h01_v);
+      simd_cf_t h10 = srslte_simd_cfi_load(h10_v);
+      simd_cf_t h11 = srslte_simd_cfi_load(h11_v);
+
+      // 1. B = W'* H'
+      simd_cf_t a00, a01, a10, a11;
       switch (i) {
         case 0:
-          a00 = _mm_add_ps(_MM_CONJ_PS(h00), _MM_CONJ_PS(h01));
-          a01 = _mm_add_ps(_MM_CONJ_PS(h10), _MM_CONJ_PS(h11));
-          a10 = _mm_sub_ps(_MM_CONJ_PS(h00), _MM_CONJ_PS(h01));
-          a11 = _mm_sub_ps(_MM_CONJ_PS(h10), _MM_CONJ_PS(h11));
+          a00 = srslte_simd_cf_add(srslte_simd_cf_conj(h00), srslte_simd_cf_conj(h01));
+          a01 = srslte_simd_cf_add(srslte_simd_cf_conj(h10), srslte_simd_cf_conj(h11));
+          a10 = srslte_simd_cf_sub(srslte_simd_cf_conj(h00), srslte_simd_cf_conj(h01));
+          a11 = srslte_simd_cf_sub(srslte_simd_cf_conj(h10), srslte_simd_cf_conj(h11));
           break;
         case 1:
-          a00 = _mm_sub_ps(_MM_CONJ_PS(h00), _MM_MULJ_PS(_MM_CONJ_PS(h01)));
-          a01 = _mm_sub_ps(_MM_CONJ_PS(h10), _MM_MULJ_PS(_MM_CONJ_PS(h11)));
-          a10 = _mm_add_ps(_MM_CONJ_PS(h00), _MM_MULJ_PS(_MM_CONJ_PS(h01)));
-          a11 = _mm_add_ps(_MM_CONJ_PS(h10), _MM_MULJ_PS(_MM_CONJ_PS(h11)));
+          a00 = srslte_simd_cf_sub(srslte_simd_cf_conj(h00), srslte_simd_cf_mulj(srslte_simd_cf_conj(h01)));
+          a01 = srslte_simd_cf_sub(srslte_simd_cf_conj(h10), srslte_simd_cf_mulj(srslte_simd_cf_conj(h11)));
+          a10 = srslte_simd_cf_add(srslte_simd_cf_conj(h00), srslte_simd_cf_mulj(srslte_simd_cf_conj(h01)));
+          a11 = srslte_simd_cf_add(srslte_simd_cf_conj(h10), srslte_simd_cf_mulj(srslte_simd_cf_conj(h11)));
           break;
         default:
           return SRSLTE_ERROR;
       }
 
-      /* 2. B = W' * H' * H = A * H */
-      __m128 b00 = _mm_add_ps(_MM_PROD_PS(a00, h00), _MM_PROD_PS(a01, h10));
-      __m128 b01 = _mm_add_ps(_MM_PROD_PS(a00, h01), _MM_PROD_PS(a01, h11));
-      __m128 b10 = _mm_add_ps(_MM_PROD_PS(a10, h00), _MM_PROD_PS(a11, h10));
-      __m128 b11 = _mm_add_ps(_MM_PROD_PS(a10, h01), _MM_PROD_PS(a11, h11));
+      // 2. B = W' * H' * H = A * H
+      simd_cf_t b00 = srslte_simd_cf_add(srslte_simd_cf_prod(a00, h00), srslte_simd_cf_prod(a01, h10));
+      simd_cf_t b01 = srslte_simd_cf_add(srslte_simd_cf_prod(a00, h01), srslte_simd_cf_prod(a01, h11));
+      simd_cf_t b10 = srslte_simd_cf_add(srslte_simd_cf_prod(a10, h00), srslte_simd_cf_prod(a11, h10));
+      simd_cf_t b11 = srslte_simd_cf_add(srslte_simd_cf_prod(a10, h01), srslte_simd_cf_prod(a11, h11));
 
-      /* 3. C = W' * H' * H * W' = B * W */
-      __m128 c00, c01, c10, c11;
+      // 3. C = W' * H' * H * W' = B * W
+      simd_cf_t c00, c01, c10, c11;
       switch (i) {
         case 0:
-          c00 = _mm_add_ps(b00, b01);
-          c01 = _mm_sub_ps(b00, b01);
-          c10 = _mm_add_ps(b10, b11);
-          c11 = _mm_sub_ps(b10, b11);
+          c00 = srslte_simd_cf_add(b00, b01);
+          c01 = srslte_simd_cf_sub(b00, b01);
+          c10 = srslte_simd_cf_add(b10, b11);
+          c11 = srslte_simd_cf_sub(b10, b11);
           break;
         case 1:
-          c00 = _mm_add_ps(b00, _MM_MULJ_PS(b01));
-          c01 = _mm_sub_ps(b00, _MM_MULJ_PS(b01));
-          c10 = _mm_add_ps(b10, _MM_MULJ_PS(b11));
-          c11 = _mm_sub_ps(b10, _MM_MULJ_PS(b11));
+          c00 = srslte_simd_cf_add(b00, srslte_simd_cf_mulj(b01));
+          c01 = srslte_simd_cf_sub(b00, srslte_simd_cf_mulj(b01));
+          c10 = srslte_simd_cf_add(b10, srslte_simd_cf_mulj(b11));
+          c11 = srslte_simd_cf_sub(b10, srslte_simd_cf_mulj(b11));
           break;
         default:
          return SRSLTE_ERROR;
       }
-      c00 = _mm_mul_ps(c00, sse_norm);
-      c01 = _mm_mul_ps(c01, sse_norm);
-      c10 = _mm_mul_ps(c10, sse_norm);
-      c11 = _mm_mul_ps(c11, sse_norm);
+      c00 = srslte_simd_cf_mul(c00, simd_f_norm);
+      c01 = srslte_simd_cf_mul(c01, simd_f_norm);
+      c10 = srslte_simd_cf_mul(c10, simd_f_norm);
+      c11 = srslte_simd_cf_mul(c11, simd_f_norm);
 
-      /* 4. C += noise * I */
-      c00 = _mm_add_ps(c00, sse_noise_estimate);
-      c11 = _mm_add_ps(c11, sse_noise_estimate);
+      // 4. C += noise * I
+      c00 = srslte_simd_cf_add(c00, simd_cf_noise_estimate);
+      c11 = srslte_simd_cf_add(c11, simd_cf_noise_estimate);
 
-      /* 5. detC */
-      __m128 detC = srslte_mat_2x2_det_sse(c00, c01, c10, c11);
-      __m128 inv_detC = srslte_mat_cf_recip_sse(detC);
-      inv_detC = _mm_mul_ps(sse_noise_estimate, inv_detC);
+      // 5. detC
+      simd_f_t detC = srslte_simd_cf_re(srslte_mat_2x2_det_simd(c00, c01, c10, c11));
 
-      __m128 den0 = _MM_PROD_PS(c00, inv_detC);
-      __m128 den1 = _MM_PROD_PS(c11, inv_detC);
+      // Avoid zero determinant
+      detC = srslte_simd_f_select(detC, simd_f_det_min, srslte_simd_f_min(detC, simd_f_det_min));
 
-      __m128 gamma0 = _mm_sub_ps(_mm_rcp_ps(den0), sse_ones);
-      __m128 gamma1 = _mm_sub_ps(_mm_rcp_ps(den1), sse_ones);
+      simd_f_t inv_detC = srslte_simd_f_rcp(detC);
+      inv_detC          = srslte_simd_f_mul(simd_f_noise_estimate, inv_detC);
 
-      /* Add for averaging */
-      __m128 sinr_sse = _mm_add_ps(gamma0, gamma1);
-      __attribute__((aligned(128))) float sinr[4];
-      _mm_store_ps(sinr, sinr_sse);
+      simd_f_t den0 = srslte_simd_f_mul(srslte_simd_cf_re(c00), inv_detC);
+      simd_f_t den1 = srslte_simd_f_mul(srslte_simd_cf_re(c11), inv_detC);
 
-      sinr_list[i] += sinr[0] + sinr[2];
+      simd_f_t gamma0 = srslte_simd_f_sub(srslte_simd_f_rcp(den0), simd_f_ones);
+      simd_f_t gamma1 = srslte_simd_f_sub(srslte_simd_f_rcp(den1), simd_f_ones);
 
-      count += 2;
+      // Avoid negative gamma
+      gamma0 = srslte_simd_f_select(gamma0, simd_f_gamma_min, srslte_simd_f_min(gamma0, simd_f_gamma_min));
+      gamma1 = srslte_simd_f_select(gamma1, simd_f_gamma_min, srslte_simd_f_min(gamma1, simd_f_gamma_min));
+
+      simd_f_t gamma_sum = srslte_simd_f_hadd(gamma0, gamma1);
+
+      // Horizontal accumulation
+      for (int k = 1; k < SRSLTE_SIMD_F_SIZE; k *= 2) {
+        gamma_sum = srslte_simd_f_hadd(gamma_sum, gamma_sum);
+      }
+
+      // Temporary store for the accumulated values
+      __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN / 8))) float v[SRSLTE_SIMD_F_SIZE];
+      srslte_simd_f_store(v, gamma_sum);
+
+      // Average the block and accumulate the SINR
+      sinr_acc += (v[0] / SRSLTE_SIMD_CF_SIZE);
+
+      // Increase loop counter
+      count += 1.0f;
     }
 
-    /* Divide average by noise */
-    if (count) {
-      sinr_list[i] /= count;
+    // Average loop accumulator
+    if (isnormal(count)) {
+      sinr_acc /= count;
+    } else {
+      sinr_acc = 1e+9f;
     }
 
-    if (sinr_list[i] > max_sinr) {
-      max_sinr = sinr_list[i];
+    // Set SINR if available
+    if (sinr_list) {
+      sinr_list[i] = sinr_acc;
+    }
+
+    // Set PMI if available
+    if (pmi && sinr_acc > max_sinr) {
+      max_sinr = sinr_acc;
       *pmi = i;
     }
   }
 
-  return i;
+  // Return number of codebooks
+  return 2;
 }
 
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_AVX
-
-int srslte_precoding_pmi_select_2l_avx(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                       uint32_t nof_symbols,
-                                       float noise_estimate,
-                                       uint32_t* pmi,
-                                       float sinr_list[SRSLTE_MAX_CODEBOOKS])
-{
-
-  float max_sinr = 0.0;
-  uint32_t i, count;
-
-  __m256 avx_noise_estimate =
-      _mm256_setr_ps(noise_estimate, 0.0f, noise_estimate, 0.0f, noise_estimate, 0.0f, noise_estimate, 0.0f);
-  __m256 avx_norm = _mm256_set1_ps(0.25f);
-  __m256 avx_ones = _mm256_set1_ps(1.0f);
-
-  for (i = 0; i < 2; i++) {
-    sinr_list[i] = 0;
-    count = 0;
-
-    for (uint32_t j = 0; j < nof_symbols - PMI_SEL_PRECISION * 4 + 1; j += PMI_SEL_PRECISION * 4) {
-      /* 0. Load channel matrix */
-      __m256 h00 = _mm256_setr_ps(crealf(h[0][0][j]),
-                                  cimagf(h[0][0][j]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[0][0][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[0][0][j + PMI_SEL_PRECISION * 3]));
-      __m256 h01 = _mm256_setr_ps(crealf(h[1][0][j]),
-                                  cimagf(h[1][0][j]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[1][0][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[1][0][j + PMI_SEL_PRECISION * 3]));
-      __m256 h10 = _mm256_setr_ps(crealf(h[0][1][j]),
-                                  cimagf(h[0][1][j]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[0][1][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[0][1][j + PMI_SEL_PRECISION * 3]));
-      __m256 h11 = _mm256_setr_ps(crealf(h[1][1][j]),
-                                  cimagf(h[1][1][j]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION * 2]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION * 2]),
-                                  crealf(h[1][1][j + PMI_SEL_PRECISION * 3]),
-                                  cimagf(h[1][1][j + PMI_SEL_PRECISION * 3]));
-
-      /* 1. B = W'* H' */
-      __m256 a00, a01, a10, a11;
-      switch (i) {
-        case 0:
-          a00 = _mm256_add_ps(_MM256_CONJ_PS(h00), _MM256_CONJ_PS(h01));
-          a01 = _mm256_add_ps(_MM256_CONJ_PS(h10), _MM256_CONJ_PS(h11));
-          a10 = _mm256_sub_ps(_MM256_CONJ_PS(h00), _MM256_CONJ_PS(h01));
-          a11 = _mm256_sub_ps(_MM256_CONJ_PS(h10), _MM256_CONJ_PS(h11));
-          break;
-        case 1:
-          a00 = _mm256_sub_ps(_MM256_CONJ_PS(h00), _MM256_MULJ_PS(_MM256_CONJ_PS(h01)));
-          a01 = _mm256_sub_ps(_MM256_CONJ_PS(h10), _MM256_MULJ_PS(_MM256_CONJ_PS(h11)));
-          a10 = _mm256_add_ps(_MM256_CONJ_PS(h00), _MM256_MULJ_PS(_MM256_CONJ_PS(h01)));
-          a11 = _mm256_add_ps(_MM256_CONJ_PS(h10), _MM256_MULJ_PS(_MM256_CONJ_PS(h11)));
-          break;
-        default:
-          return SRSLTE_ERROR;
-      }
-
-      /* 2. B = W' * H' * H = A * H */
-#ifdef LV_HAVE_FMA
-      __m256 b00 = _MM256_PROD_ADD_PS(a00, h00, _MM256_PROD_PS(a01, h10));
-      __m256 b01 = _MM256_PROD_ADD_PS(a00, h01, _MM256_PROD_PS(a01, h11));
-      __m256 b10 = _MM256_PROD_ADD_PS(a10, h00, _MM256_PROD_PS(a11, h10));
-      __m256 b11 = _MM256_PROD_ADD_PS(a10, h01, _MM256_PROD_PS(a11, h11));
-#else
-      __m256 b00 = _mm256_add_ps(_MM256_PROD_PS(a00, h00), _MM256_PROD_PS(a01, h10));
-      __m256 b01 = _mm256_add_ps(_MM256_PROD_PS(a00, h01), _MM256_PROD_PS(a01, h11));
-      __m256 b10 = _mm256_add_ps(_MM256_PROD_PS(a10, h00), _MM256_PROD_PS(a11, h10));
-      __m256 b11 = _mm256_add_ps(_MM256_PROD_PS(a10, h01), _MM256_PROD_PS(a11, h11));
-#endif /* LV_HAVE_FMA */
-
-      /* 3. C = W' * H' * H * W' = B * W */
-      __m256 c00, c01, c10, c11;
-      switch (i) {
-        case 0:
-          c00 = _mm256_add_ps(b00, b01);
-          c01 = _mm256_sub_ps(b00, b01);
-          c10 = _mm256_add_ps(b10, b11);
-          c11 = _mm256_sub_ps(b10, b11);
-          break;
-        case 1:
-          c00 = _mm256_add_ps(b00, _MM256_MULJ_PS(b01));
-          c01 = _mm256_sub_ps(b00, _MM256_MULJ_PS(b01));
-          c10 = _mm256_add_ps(b10, _MM256_MULJ_PS(b11));
-          c11 = _mm256_sub_ps(b10, _MM256_MULJ_PS(b11));
-          break;
-        default:
-          return SRSLTE_ERROR;
-      }
-      c00 = _mm256_mul_ps(c00, avx_norm);
-      c01 = _mm256_mul_ps(c01, avx_norm);
-      c10 = _mm256_mul_ps(c10, avx_norm);
-      c11 = _mm256_mul_ps(c11, avx_norm);
-
-      /* 4. C += noise * I */
-      c00 = _mm256_add_ps(c00, avx_noise_estimate);
-      c11 = _mm256_add_ps(c11, avx_noise_estimate);
-
-      /* 5. detC */
-      __m256 detC = srslte_mat_2x2_det_avx(c00, c01, c10, c11);
-      __m256 inv_detC = srslte_mat_cf_recip_avx(detC);
-      inv_detC = _mm256_mul_ps(avx_noise_estimate, inv_detC);
-
-      __m256 den0 = _MM256_PROD_PS(c00, inv_detC);
-      __m256 den1 = _MM256_PROD_PS(c11, inv_detC);
-
-      __m256 gamma0 = _mm256_sub_ps(_mm256_rcp_ps(den0), avx_ones);
-      __m256 gamma1 = _mm256_sub_ps(_mm256_rcp_ps(den1), avx_ones);
-
-      /* Add for averaging */
-      __m256 sinr_avx = _mm256_permute_ps(_mm256_add_ps(gamma0, gamma1), 0b00101000);
-      __attribute__((aligned(256))) float sinr[8];
-      _mm256_store_ps(sinr, sinr_avx);
-
-      sinr_list[i] += sinr[0] + sinr[2] + sinr[4] + sinr[6];
-
-      count += 4;
-    }
-
-    /* Divide average by noise */
-    if (count) {
-      sinr_list[i] /= count;
-    }
-
-    if (sinr_list[i] > max_sinr) {
-      max_sinr = sinr_list[i];
-      *pmi = i;
-    }
-  }
-
-  return i;
-}
-
-#endif /* LV_HAVE_AVX */
+#endif /* SRSLTE_SIMD_CF_SIZE */
 
 /* PMI Select for 2 layers */
 int srslte_precoding_pmi_select_2l(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
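Steps 4 and 5 of the 2-layer kernel implement the MMSE post-equalisation SINR: with C = W'H'HW/4 + noise*I, each layer's SINR is 1/(noise*[C^-1]_ii) - 1, and the reciprocal of the determinant is where the new floors matter. A scalar reference of that computation, under the same floors the SIMD code applies (det >= 1e-10, gamma >= 1e-9); the function name is illustrative and, like the kernel, it indexes the diagonal entries directly:

#include <complex.h>

/* Scalar sketch of steps 4-5: c00/c11 are the diagonal of C = W'H'HW/4
 * before the noise term is added; gamma[] receives the per-layer SINRs. */
static void mmse_2l_sinr_ref(float complex c00, float complex c01,
                             float complex c10, float complex c11,
                             float noise, float gamma[2])
{
  c00 += noise;
  c11 += noise;

  /* C is Hermitian, so the determinant is real */
  float det = crealf(c00 * c11 - c01 * c10);
  if (det < 1e-10f) {
    det = 1e-10f; /* avoid dividing by a vanishing determinant */
  }

  gamma[0] = 1.0f / (noise * crealf(c00) / det) - 1.0f;
  gamma[1] = 1.0f / (noise * crealf(c11) / det) - 1.0f;

  for (int i = 0; i < 2; i++) {
    if (gamma[i] < 1e-9f) {
      gamma[i] = 1e-9f; /* avoid negative SINR from reciprocal rounding */
    }
  }
}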
@@ -3002,15 +2769,11 @@
 {
   int ret;
 
-#ifdef LV_HAVE_AVX
-  ret = srslte_precoding_pmi_select_2l_avx(h, nof_symbols, noise_estimate, pmi, sinr_list);
-#else
-#ifdef LV_HAVE_SSE
-  ret = srslte_precoding_pmi_select_2l_sse(h, nof_symbols, noise_estimate, pmi, sinr_list);
+#ifdef SRSLTE_SIMD_CF_SIZE
+  ret = srslte_precoding_pmi_select_2l_simd(h, nof_symbols, noise_estimate, pmi, sinr_list);
 #else
   ret = srslte_precoding_pmi_select_2l_gen(h, nof_symbols, noise_estimate, pmi, sinr_list);
-#endif /* LV_HAVE_SSE */
-#endif /* LV_HAVE_AVX */
+#endif /* SRSLTE_SIMD_CF_SIZE */
 
   INFO("Precoder PMI Select for 2 layers SINR=[%.1fdB; %.1fdB] PMI=%d\n",
        srslte_convert_power_to_dB(sinr_list[0]),
@@ -3029,15 +2792,17 @@ int srslte_precoding_pmi_select(cf_t* h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
 {
   int ret;
 
-  if (sinr == NULL || pmi == NULL) {
-    ERROR("Null pointer");
-    ret = SRSLTE_ERROR_INVALID_INPUTS;
-  } else if (nof_layers == 1) {
+  // Bound noise estimate value
+  if (!isnormal(noise_estimate) || noise_estimate < 1e-9f) {
+    noise_estimate = 1e-9f;
+  }
+
+  if (nof_layers == 1) {
     ret = srslte_precoding_pmi_select_1l(h, nof_symbols, noise_estimate, pmi, sinr);
   } else if (nof_layers == 2) {
     ret = srslte_precoding_pmi_select_2l(h, nof_symbols, noise_estimate, pmi, sinr);
   } else {
-    ERROR("Wrong number of layers");
+    ERROR("Unsupported number of layers");
     ret = SRSLTE_ERROR_INVALID_INPUTS;
   }
diff --git a/lib/src/phy/mimo/test/pmi_select_test.c b/lib/src/phy/mimo/test/pmi_select_test.c
index 26dfa00ab..5d273f5b3 100644
--- a/lib/src/phy/mimo/test/pmi_select_test.c
+++ b/lib/src/phy/mimo/test/pmi_select_test.c
@@ -82,7 +82,14 @@ int main(int argc, char** argv)
 
     /* Check SINR for 1 layer */
     for (int i = 0; i < ret; i++) {
-      if (fabsf(gold->snri_1l[i] - sinr_1l[i]) > 0.1) {
+      float err = fabsf(gold->snri_1l[i] - sinr_1l[i]);
+
+      // Normalise to prevent floating point rounding error
+      if (gold->snri_1l[i] > 1000.0f) {
+        err /= gold->snri_1l[i];
+      }
+
+      if (err > 0.1f) {
         ERROR("Test case %d failed computing 1 layer SINR for codebook %d (test=%.2f; gold=%.2f)\n",
               c + 1,
               i,
@@ -107,7 +114,14 @@ int main(int argc, char** argv)
 
     /* Check SINR for 2 layer */
     for (int i = 0; i < ret; i++) {
-      if (fabsf(gold->snri_2l[i] - sinr_2l[i]) > 0.1) {
+      float err = fabsf(gold->snri_2l[i] - sinr_2l[i]);
+
+      // Normalise to prevent floating point rounding error
+      if (gold->snri_2l[i] > 1000.0f) {
+        err /= gold->snri_2l[i];
+      }
+
+      if (err > 0.1f) {
         ERROR("Test case %d failed computing 2 layer SINR for codebook %d (test=%.2f; gold=%.2f)\n",
               c + 1,
               i,
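The updated checks switch from a fixed absolute tolerance to a relative one once the gold SINR saturates into the 1e9 range, where a 0.1 absolute tolerance is far below single-precision resolution. The rule, factored out as a sketch (helper name is hypothetical; the 1000.0f threshold and 0.1f tolerance mirror pmi_select_test.c):

#include <math.h>
#include <stdbool.h>

/* Absolute error for small gold values, relative error for saturated ones. */
static bool sinr_matches(float gold, float test)
{
  float err = fabsf(gold - test);
  if (gold > 1000.0f) {
    err /= gold;
  }
  return err <= 0.1f;
}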
diff --git a/lib/src/phy/mimo/test/pmi_select_test.h b/lib/src/phy/mimo/test/pmi_select_test.h
index bed7433c9..41b9b6cd1 100644
--- a/lib/src/phy/mimo/test/pmi_select_test.h
+++ b/lib/src/phy/mimo/test/pmi_select_test.h
@@ -22,7 +22,7 @@
 #ifndef PMI_SELECT_TEST_H
 #define PMI_SELECT_TEST_H
 
-#define PMI_SELECT_TEST_NOF_CASES 16
+#define PMI_SELECT_TEST_NOF_CASES 20
 
 #include
 
@@ -215,6 +215,46 @@ static pmi_select_test_case_gold_t pmi_select_test_case_gold[PMI_SELECT_TEST_NOF
         .ri = 2,
         .k = 7.7799,
     },
+    {
+        /* Test case 17 */
+        .h = {{1.0f, 0.0f}, {0.0f, 0.0f}},
+        .n = 0.0f,
+        .snri_1l = {5e8f, 5e8f, 5e8f, 5e8f},
+        .snri_2l = {0.0f, 0.0f},
+        .pmi = {0, 0},
+        .ri = 2,
+        .k = 93.0,
+    },
+    {
+        /* Test case 18 */
+        .h = {{1.0f, 0.0f}, {1.0f, 0.0f}},
+        .n = 0.0f,
+        .snri_1l = {2e9f, 0.0f, 1e9f, 1e9f},
+        .snri_2l = {1e9f, 0.0f},
+        .pmi = {0, 0},
+        .ri = 2,
+        .k = 96.0,
+    },
+    {
+        /* Test case 19 */
+        .h = {{1.0f, 1.0f}, {1.0f, 1.0f}},
+        .n = 0.0f,
+        .snri_1l = {4e9f, 0.0f, 2e9f, 2e9f},
+        .snri_2l = {2e9f, 0.0f},
+        .pmi = {0, 0},
+        .ri = 1,
+        .k = 99.0,
+    },
+    {
+        /* Test case 20 */
+        .h = {{1.0f, 0.0f}, {0.0f, 1.0f}},
+        .n = 0.0f,
+        .snri_1l = {1e9f, 1e9f, 1e9f, 1e9f},
+        .snri_2l = {1e9f, 1e9f},
+        .pmi = {0, 0},
+        .ri = 2,
+        .k = 0.0,
+    },
 };
 
 #endif /* PMI_SELECT_TEST_H */
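The gold values of the four degenerate cases follow directly from the noise floor that srslte_precoding_pmi_select() now applies: with n = 0 clamped to 1e-9, case 17 (only h00 = 1) yields the same SINR for every codebook, since each precoder sees |w'h|^2 = 1 under the 0.5 normalisation of the 1-layer path. A quick check of that arithmetic:

#include <stdio.h>

int main(void)
{
  /* Test case 17: h = [[1, 0], [0, 0]], n = 0 floored to 1e-9 */
  const float noise = 1e-9f;
  const float gamma = 0.5f * 1.0f / noise; /* 0.5 * |w'h|^2 / noise */
  printf("expected snri_1l = %g\n", gamma); /* prints 5e+08 */
  return 0;
}

Cases 18-20 saturate the same way (2e9, 4e9, 1e9), which is exactly the regime the relative-error check above was added for.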
diff --git a/lib/src/phy/ue/ue_dl.c b/lib/src/phy/ue/ue_dl.c
index a1ba9402d..bd6abb067 100644
--- a/lib/src/phy/ue/ue_dl.c
+++ b/lib/src/phy/ue/ue_dl.c
@@ -578,9 +578,9 @@ static int find_dl_dci_type_crnti(srslte_ue_dl_t* q,
   dci_blind_search_t search_space;
   dci_blind_search_t* current_ss = &search_space;
 
-  uint32_t sf_idx = sf->tti % 10;
-  uint32_t cfi = sf->cfi;
-  srslte_dci_cfg_t dci_cfg = cfg->cfg.dci;
+  uint32_t         sf_idx  = sf->tti % 10;
+  uint32_t         cfi     = sf->cfi;
+  srslte_dci_cfg_t dci_cfg = cfg->cfg.dci;
 
   // Search first Common SS
 
@@ -606,7 +606,7 @@ static int find_dl_dci_type_crnti(srslte_ue_dl_t* q,
   }
 
   // Search UE-specific search space
-  dci_cfg = cfg->cfg.dci;
+  dci_cfg    = cfg->cfg.dci;
   if (q->pregen_rnti == rnti) {
     current_ss = &q->current_ss_ue[MI_IDX(sf_idx)][cfi - 1][sf_idx];
   } else {
@@ -740,7 +740,7 @@ static int select_pmi(srslte_ue_dl_t* q, uint32_t ri, uint32_t* pmi, float* sinr
 
   /* Set PMI */
   if (sinr_db != NULL) {
-    *sinr_db = srslte_convert_power_to_dB(sinr_list[*pmi % SRSLTE_MAX_CODEBOOKS]);
+    *sinr_db = srslte_convert_power_to_dB(sinr_list[best_pmi % SRSLTE_MAX_CODEBOOKS]);
  }
 }
diff --git a/lib/src/phy/utils/mat.c b/lib/src/phy/utils/mat.c
index fe8e1262b..e3135d12b 100644
--- a/lib/src/phy/utils/mat.c
+++ b/lib/src/phy/utils/mat.c
@@ -122,25 +122,32 @@ void srslte_mat_2x2_mmse_gen(cf_t y0,
 
 inline float srslte_mat_2x2_cn(cf_t h00, cf_t h01, cf_t h10, cf_t h11)
 {
-  /* 1. A = H * H' (A = A') */
+  // 1. A = H * H' (A = A')
   float a00 =
       crealf(h00) * crealf(h00) + crealf(h01) * crealf(h01) + cimagf(h00) * cimagf(h00) + cimagf(h01) * cimagf(h01);
-  cf_t a01 = h00 * conjf(h10) + h01 * conjf(h11);
-  // cf_t a10 = h10*conjf(h00) + h11*conjf(h01) = conjf(a01);
+  cf_t  a01 = h00 * conjf(h10) + h01 * conjf(h11);
   float a11 =
       crealf(h10) * crealf(h10) + crealf(h11) * crealf(h11) + cimagf(h10) * cimagf(h10) + cimagf(h11) * cimagf(h11);
 
-  /* 2. |H * H' - {λ0, λ1}| = 0 -> aλ² + bλ + c = 0 */
+  // 2. |A - λI| = 0 -> λ² - bλ + c = 0
  float b = a00 + a11;
   float c = a00 * a11 - (crealf(a01) * crealf(a01) + cimagf(a01) * cimagf(a01));
 
-  /* 3. λ = (-b ± sqrt(b² - 4 * c))/2 */
+  // 3. λ = (b ± sqrt(b² - 4 * c)) / 2; the common 1/2 cancels in the ratio below
   float sqr = sqrtf(b * b - 4.0f * c);
   float xmax = b + sqr;
   float xmin = b - sqr;
 
-  /* 4. κ = sqrt(λ_max / λ_min) */
-  return 10 * log10f(xmax / xmin);
+  // 4. Bound xmin and xmax
+  if (!isnormal(xmin) || xmin < 1e-9) {
+    xmin = 1e-9;
+  }
+  if (!isnormal(xmax) || xmax > 1e+9) {
+    xmax = 1e+9;
+  }
+
+  // 5. κ = sqrt(λ_max / λ_min), in dB: 20·log10(κ) = 10·log10(λ_max / λ_min)
+  return 10.0f * log10f(xmax / xmin);
 }
 
 #ifdef LV_HAVE_SSE
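These bounds are what make the new gold condition numbers in pmi_select_test.h finite for rank-deficient channels. A worked check for test case 17, following the steps above:

/* Test case 17: h = [[1, 0], [0, 0]]
 * A = H*H'  =>  a00 = 1, a01 = 0, a11 = 0
 * b = a00 + a11 = 1,  c = a00*a11 - |a01|^2 = 0
 * sqr = sqrtf(b*b - 4*c) = 1  =>  xmax = 2, xmin = 0 -> floored to 1e-9
 * k = 10 * log10f(2 / 1e-9) ~= 93.0 dB, matching the gold .k value.  */

The same arithmetic gives 96.0 dB for case 18, 99.0 dB for case 19, and 0.0 dB for the identity channel of case 20.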
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index c9849dcb6..7727b2c39 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -27,6 +27,7 @@
 
 #include "srslte/phy/utils/bit.h"
 #include "srslte/phy/utils/debug.h"
+#include "srslte/phy/utils/simd.h"
 #include "srslte/phy/utils/vector.h"
 #include "srslte/phy/utils/vector_simd.h"