From 7146819fcd6eedff800a1993dcbfea54ff06da4d Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Fri, 20 Apr 2018 11:17:33 +0200 Subject: [PATCH] Added CSI Predecoding for TM4 with SIMD Support --- lib/src/phy/mimo/precoding.c | 916 +++++++++++++++++++---------------- 1 file changed, 488 insertions(+), 428 deletions(-) diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c index db0938696..dbf34ab90 100644 --- a/lib/src/phy/mimo/precoding.c +++ b/lib/src/phy/mimo/precoding.c @@ -1118,14 +1118,17 @@ int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS], return SRSLTE_ERROR; } -#ifdef LV_HAVE_AVX +static int srslte_predecoding_multiplex_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi, + int codebook_idx, + int nof_symbols, + float scaling) { + float norm = 1.0f; + int i = 0; -// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { - float norm = 1.0; - - switch(codebook_idx) { + switch (codebook_idx) { case 0: norm = (float) M_SQRT2 / scaling; break; @@ -1138,145 +1141,57 @@ int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S return SRSLTE_ERROR; } - for (int i = 0; i < nof_symbols - 3; i += 4) { - __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); - __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); - __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); - __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); +#if SRSLTE_SIMD_CF_SIZE != 0 + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); - __m256 h00, h01, h10, h11; + simd_cf_t h00, h01, h10, h11; switch (codebook_idx) { case 0: - h00 = _h00; - h01 = _h10; - h10 = _h01; - h11 = _h11; + h00 = h00i; + h01 = h10i; + h10 = h01i; + h11 = h11i; break; case 1: - h00 = _mm256_add_ps(_h00, _h10); - h01 = _mm256_sub_ps(_h00, _h10); - h10 = _mm256_add_ps(_h01, _h11); - h11 = _mm256_sub_ps(_h01, _h11); + h00 = srslte_simd_cf_add(h00i, h10i); + h01 = srslte_simd_cf_sub(h00i, h10i); + h10 = srslte_simd_cf_add(h01i, h11i); + h11 = srslte_simd_cf_sub(h01i, h11i); break; case 2: - h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10)); - h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10)); - h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11)); - h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11)); - break; - default: - DEBUG("Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - __m256 y0 = _mm256_load_ps((float *) &y[0][i]); - __m256 y1 = _mm256_load_ps((float *) &y[1][i]); - - __m256 x0, x1; - - srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, norm); - - _mm256_store_ps((float *) &x[0][i], x0); - _mm256_store_ps((float *) &x[1][i], x1); - - } - - return SRSLTE_SUCCESS; -} - -#endif /* LV_HAVE_AVX */ - -#ifdef LV_HAVE_SSE - -// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { - float norm = 1.0; - - switch(codebook_idx) { - case 0: - norm = (float) M_SQRT2 / scaling; - break; - case 1: - case 2: - norm = 2.0f / scaling; - break; - default: - ERROR("Wrong codebook_idx=%d", codebook_idx); - return SRSLTE_ERROR; - } - - for (int i = 0; i < nof_symbols - 1; i += 2) { - __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i])); - __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i])); - __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i])); - __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i])); - - __m128 h00, h01, h10, h11; - switch (codebook_idx) { - case 0: - h00 = _h00; - h01 = _h10; - h10 = _h01; - h11 = _h11; - break; - case 1: - h00 = _mm_add_ps(_h00, _h10); - h01 = _mm_sub_ps(_h00, _h10); - h10 = _mm_add_ps(_h01, _h11); - h11 = _mm_sub_ps(_h01, _h11); - break; - case 2: - h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); - h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); - h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); - h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i)); + h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i)); + h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i)); break; default: fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); return SRSLTE_ERROR; } - __m128 y0 = _mm_load_ps((float *) &y[0][i]); - __m128 y1 = _mm_load_ps((float *) &y[1][i]); + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); - __m128 x0, x1; + simd_cf_t x0, x1; + simd_f_t csi0, csi1; + srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm); - srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, norm); - - _mm_store_ps((float *) &x[0][i], x0); - _mm_store_ps((float *) &x[1][i], x1); + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + srslte_simd_f_store(&csi[i], csi0); + srslte_simd_f_store(&csi[i], csi1); } +#endif /* SRSLTE_SIMD_CF_SIZE */ - return SRSLTE_SUCCESS; -} + for (; i < nof_symbols; i++) { + cf_t h00, h01, h10, h11; -#endif /* LV_HAVE_SSE */ - - -// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { - float norm = 1.0; - - switch(codebook_idx) { - case 0: - norm = (float) M_SQRT2 / scaling; - break; - case 1: - case 2: - norm = 2.0f / scaling; - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - for (int i = 0; i < nof_symbols; i++) { - cf_t h00, h01, h10, h11, det; - - switch(codebook_idx) { + switch (codebook_idx) { case 0: h00 = h[0][0][i]; h01 = h[1][0][i]; @@ -1290,168 +1205,252 @@ int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S h11 = h[0][1][i] - h[1][1][i]; break; case 2: - h00 = h[0][0][i] + _Complex_I*h[1][0][i]; - h01 = h[0][0][i] - _Complex_I*h[1][0][i]; - h10 = h[0][1][i] + _Complex_I*h[1][1][i]; - h11 = h[0][1][i] - _Complex_I*h[1][1][i]; + h00 = h[0][0][i] + _Complex_I * h[1][0][i]; + h01 = h[0][0][i] - _Complex_I * h[1][0][i]; + h10 = h[0][1][i] + _Complex_I * h[1][1][i]; + h11 = h[0][1][i] - _Complex_I * h[1][1][i]; break; default: fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); return SRSLTE_ERROR; } - det = (h00 * h11 - h01 * h10); + cf_t det = (h00 * h11 - h01 * h10); det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det))); x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det; x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det; + + csi[i] = 1.0f; + csi[i] = 1.0f; } return SRSLTE_SUCCESS; } -#ifdef LV_HAVE_AVX - -// AVX implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, - float scaling, float noise_estimate) { - float norm = 1.0; - - switch(codebook_idx) { - case 0: - norm = (float) M_SQRT2 / scaling; - break; - case 1: - case 2: - norm = 2.0f / scaling; - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - for (int i = 0; i < nof_symbols; i += 4) { - __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); - __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); - __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); - __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); - - __m256 h00, h01, h10, h11; - switch (codebook_idx) { - case 0: - h00 = _h00; - h01 = _h10; - h10 = _h01; - h11 = _h11; - break; - case 1: - h00 = _mm256_add_ps(_h00, _h10); - h01 = _mm256_sub_ps(_h00, _h10); - h10 = _mm256_add_ps(_h01, _h11); - h11 = _mm256_sub_ps(_h01, _h11); - break; - case 2: - h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10)); - h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10)); - h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11)); - h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11)); - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - __m256 y0 = _mm256_load_ps((float *) &y[0][i]); - __m256 y1 = _mm256_load_ps((float *) &y[1][i]); - - __m256 x0, x1; - - srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm); - - _mm256_store_ps((float *) &x[0][i], x0); - _mm256_store_ps((float *) &x[1][i], x1); - - } - - return SRSLTE_SUCCESS; -} - -#endif /* LV_HAVE_AVX */ - - -#ifdef LV_HAVE_SSE - -// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, - float scaling, float noise_estimate) { - float norm; - - switch(codebook_idx) { - case 0: - norm = (float) M_SQRT2 / scaling; - break; - case 1: - case 2: - norm = 2.0f / scaling; - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - for (int i = 0; i < nof_symbols - 1; i += 2) { - __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i])); - __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i])); - __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i])); - __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i])); - - __m128 h00, h01, h10, h11; - switch (codebook_idx) { - case 0: - h00 = _h00; - h01 = _h10; - h10 = _h01; - h11 = _h11; - break; - case 1: - h00 = _mm_add_ps(_h00, _h10); - h01 = _mm_sub_ps(_h00, _h10); - h10 = _mm_add_ps(_h01, _h11); - h11 = _mm_sub_ps(_h01, _h11); - break; - case 2: - h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); - h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); - h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); - h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - __m128 y0 = _mm_load_ps((float *) &y[0][i]); - __m128 y1 = _mm_load_ps((float *) &y[1][i]); - - __m128 x0, x1; - - srslte_mat_2x2_mmse_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm); - - _mm_store_ps((float *) &x[0][i], x0); - _mm_store_ps((float *) &x[1][i], x1); - - } - - return SRSLTE_SUCCESS; -} -#endif /* LV_HAVE_SSE */ - // Generic implementation of ZF 2x2 Spatial Multiplexity equalizer -int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, - float scaling, float noise_estimate) { +static int srslte_predecoding_multiplex_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + int codebook_idx, + int nof_symbols, + float scaling) { + float norm = 1.0f; + int i = 0; + + switch (codebook_idx) { + case 0: + norm = (float) M_SQRT2 / scaling; + break; + case 1: + case 2: + norm = 2.0f / scaling; + break; + default: + ERROR("Wrong codebook_idx=%d", codebook_idx); + return SRSLTE_ERROR; + } + +#if SRSLTE_SIMD_CF_SIZE != 0 + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); + + simd_cf_t h00, h01, h10, h11; + switch (codebook_idx) { + case 0: + h00 = h00i; + h01 = h10i; + h10 = h01i; + h11 = h11i; + break; + case 1: + h00 = srslte_simd_cf_add(h00i, h10i); + h01 = srslte_simd_cf_sub(h00i, h10i); + h10 = srslte_simd_cf_add(h01i, h11i); + h11 = srslte_simd_cf_sub(h01i, h11i); + break; + case 2: + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i)); + h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i)); + h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i)); + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); + + simd_cf_t x0, x1; + simd_f_t csi0, csi1; + srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm); + + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + + } +#endif /* SRSLTE_SIMD_CF_SIZE */ + + for (; i < nof_symbols; i++) { + cf_t h00, h01, h10, h11; + + switch (codebook_idx) { + case 0: + h00 = h[0][0][i]; + h01 = h[1][0][i]; + h10 = h[0][1][i]; + h11 = h[1][1][i]; + break; + case 1: + h00 = h[0][0][i] + h[1][0][i]; + h01 = h[0][0][i] - h[1][0][i]; + h10 = h[0][1][i] + h[1][1][i]; + h11 = h[0][1][i] - h[1][1][i]; + break; + case 2: + h00 = h[0][0][i] + _Complex_I * h[1][0][i]; + h01 = h[0][0][i] - _Complex_I * h[1][0][i]; + h10 = h[0][1][i] + _Complex_I * h[1][1][i]; + h11 = h[0][1][i] - _Complex_I * h[1][1][i]; + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], norm); + } + return SRSLTE_SUCCESS; +} + +// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer +static int srslte_predecoding_multiplex_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int codebook_idx, + int nof_symbols, + float scaling, + float noise_estimate) { + float norm = 1.0f; + int i = 0; + + switch (codebook_idx) { + case 0: + norm = (float) M_SQRT2 / scaling; + break; + case 1: + case 2: + norm = 2.0f / scaling; + break; + default: + ERROR("Wrong codebook_idx=%d", codebook_idx); + return SRSLTE_ERROR; + } + +#if SRSLTE_SIMD_CF_SIZE != 0 + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); + + simd_cf_t h00, h01, h10, h11; + switch (codebook_idx) { + case 0: + h00 = h00i; + h01 = h10i; + h10 = h01i; + h11 = h11i; + break; + case 1: + h00 = srslte_simd_cf_add(h00i, h10i); + h01 = srslte_simd_cf_sub(h00i, h10i); + h10 = srslte_simd_cf_add(h01i, h11i); + h11 = srslte_simd_cf_sub(h01i, h11i); + break; + case 2: + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i)); + h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i)); + h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i)); + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); + + simd_cf_t x0, x1; + simd_f_t csi0, csi1; + srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm); + + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + + srslte_simd_f_store(&csi[0][i], csi0); + srslte_simd_f_store(&csi[1][i], csi1); + } +#endif /* SRSLTE_SIMD_CF_SIZE */ + + for (; i < nof_symbols; i++) { + cf_t h00, h01, h10, h11; + + switch (codebook_idx) { + case 0: + h00 = h[0][0][i]; + h01 = h[1][0][i]; + h10 = h[0][1][i]; + h11 = h[1][1][i]; + break; + case 1: + h00 = h[0][0][i] + h[1][0][i]; + h01 = h[0][0][i] - h[1][0][i]; + h10 = h[0][1][i] + h[1][1][i]; + h11 = h[0][1][i] - h[1][1][i]; + break; + case 2: + h00 = h[0][0][i] + _Complex_I * h[1][0][i]; + h01 = h[0][0][i] - _Complex_I * h[1][0][i]; + h10 = h[0][1][i] + _Complex_I * h[1][1][i]; + h11 = h[0][1][i] - _Complex_I * h[1][1][i]; + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + srslte_mat_2x2_mmse_csi_gen(y[0][i], + y[1][i], + h00, + h01, + h10, + h11, + &x[0][i], + &x[1][i], + &csi[0][i], + &csi[1][i], + noise_estimate, + norm); + } + return SRSLTE_SUCCESS; +} + +static int srslte_predecoding_multiplex_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + int codebook_idx, + int nof_symbols, + float scaling, + float noise_estimate) { float norm = 1.0; + int i = 0; switch(codebook_idx) { case 0: @@ -1466,7 +1465,51 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h return SRSLTE_ERROR; } - for (int i = 0; i < nof_symbols; i++) { +#if SRSLTE_SIMD_CF_SIZE != 0 + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); + + simd_cf_t h00, h01, h10, h11; + switch(codebook_idx) { + case 0: + h00 = h00i; + h01 = h10i; + h10 = h01i; + h11 = h11i; + break; + case 1: + h00 = srslte_simd_cf_add(h00i, h10i); + h01 = srslte_simd_cf_sub(h00i, h10i); + h10 = srslte_simd_cf_add(h01i, h11i); + h11 = srslte_simd_cf_sub(h01i, h11i); + break; + case 2: + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i)); + h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i)); + h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i)); + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); + + simd_cf_t x0, x1; + simd_f_t csi0, csi1; + srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm); + + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + } +#endif /* SRSLTE_SIMD_CF_SIZE */ + + for (; i < nof_symbols; i++) { cf_t h00, h01, h10, h11; switch(codebook_idx) { @@ -1498,134 +1541,62 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h return SRSLTE_SUCCESS; } -#ifdef LV_HAVE_AVX -// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer -int srslte_predecoding_multiplex_2x1_mrc_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { - - for (int i = 0; i < nof_symbols - 3; i += 4) { - __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); - __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); - __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); - __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); - - __m256 h0, h1; - switch (codebook_idx) { - case 0: - h0 = _mm256_add_ps(_h00, _h10); - h1 = _mm256_add_ps(_h01, _h11); - break; - case 1: - h0 = _mm256_sub_ps(_h00, _h10); - h1 = _mm256_sub_ps(_h01, _h11); - break; - case 2: - h0 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10)); - h1 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11)); - break; - case 3: - h0 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10)); - h1 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11)); - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - __m256 h0_2 = _mm256_mul_ps(h0, h0); - __m256 h1_2 = _mm256_mul_ps(h1, h1); - __m256 hh0 = _mm256_add_ps(_mm256_movehdup_ps(h0_2), _mm256_moveldup_ps(h0_2)); - __m256 hh1 = _mm256_add_ps(_mm256_movehdup_ps(h1_2), _mm256_moveldup_ps(h1_2)); - __m256 hh = _mm256_add_ps(hh0, hh1); - __m256 hhrec = _mm256_rcp_ps(hh); - - hhrec = _mm256_mul_ps(hhrec, _mm256_set1_ps((float) M_SQRT2 / scaling)); - __m256 y0 = _mm256_load_ps((float*)&y[0][i]); - __m256 y1 = _mm256_load_ps((float*)&y[1][i]); - - __m256 x0 = _mm256_add_ps(_MM256_PROD_PS(_MM256_CONJ_PS(h0), y0), _MM256_PROD_PS(_MM256_CONJ_PS(h1), y1)); - x0 = _mm256_mul_ps(hhrec, x0); - - _mm256_store_ps((float*)&x[0][i], x0); - - } - - return SRSLTE_SUCCESS; -} - -#endif /* LV_HAVE_AVX */ - - -// SSE implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer -#ifdef LV_HAVE_SSE - -int srslte_predecoding_multiplex_2x1_mrc_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { - - for (int i = 0; i < nof_symbols - 1; i += 2) { - __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i])); - __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i])); - __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i])); - __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i])); - - __m128 h0, h1; - switch (codebook_idx) { - case 0: - h0 = _mm_add_ps(_h00, _h10); - h1 = _mm_add_ps(_h01, _h11); - break; - case 1: - h0 = _mm_sub_ps(_h00, _h10); - h1 = _mm_sub_ps(_h01, _h11); - break; - case 2: - h0 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); - h1 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); - break; - case 3: - h0 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); - h1 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); - break; - default: - fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); - return SRSLTE_ERROR; - } - - __m128 h0_2 = _mm_mul_ps(h0, h0); - __m128 h1_2 = _mm_mul_ps(h1, h1); - __m128 hh0 = _mm_add_ps(_mm_movehdup_ps(h0_2), _mm_moveldup_ps(h0_2)); - __m128 hh1 = _mm_add_ps(_mm_movehdup_ps(h1_2), _mm_moveldup_ps(h1_2)); - __m128 hh = _mm_add_ps(hh0, hh1); - __m128 hhrec = _mm_rcp_ps(hh); - - hhrec = _mm_mul_ps(hhrec, _mm_set1_ps((float) M_SQRT2 / scaling)); - - __m128 y0 = _mm_load_ps((float*)&y[0][i]); - __m128 y1 = _mm_load_ps((float*)&y[1][i]); - - __m128 x0 = _mm_add_ps(_MM_PROD_PS(_MM_CONJ_PS(h0), y0), _MM_PROD_PS(_MM_CONJ_PS(h1), y1)); - x0 = _mm_mul_ps(hhrec, x0); - - _mm_store_ps((float*)&x[0][i], x0); - - } - - return SRSLTE_SUCCESS; -} - -#endif /* LV_HAVE_SSE */ - -// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer -int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { +// Implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer +static int srslte_predecoding_multiplex_2x1_mrc(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + int codebook_idx, + int nof_symbols, + float scaling) { float norm = (float) M_SQRT2 / scaling; + int i = 0; - for (int i = 0; i < nof_symbols; i += 1) { +#if SRSLTE_SIMD_CF_SIZE != 0 + simd_f_t _norm = srslte_simd_f_set1(norm); + + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t x0 = srslte_simd_cf_set1(0.0f); + simd_f_t hh = srslte_simd_f_set1(0.0f); + + for (int k = 0; k < 2; k++) { + simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]); + simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]); + simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]); + + simd_cf_t hx; + switch (codebook_idx) { + case 0: + hx = srslte_simd_cf_add(h0xi, h1xi); + break; + case 1: + hx = srslte_simd_cf_sub(h0xi, h1xi); + break; + case 2: + hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi)); + break; + case 3: + hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi)); + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh); + x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0); + } + + hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh)); + srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh)); + } +#endif /* SRSLTE_SIMD_CF_SIZE */ + + for (; i < nof_symbols; i += 1) { cf_t h0, h1; float hh; - switch(codebook_idx) { + switch (codebook_idx) { case 0: h0 = h[0][0][i] + h[1][0][i]; h1 = h[0][1][i] + h[1][1][i]; @@ -1654,46 +1625,135 @@ int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[ return SRSLTE_SUCCESS; } -int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], - int nof_rxant, int nof_ports, int nof_layers, int codebook_idx, int nof_symbols, - float scaling, float noise_estimate) -{ +// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer +static int srslte_predecoding_multiplex_2x1_mrc_csi(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi, + int codebook_idx, + int nof_symbols, + float scaling) { + float norm = (float) M_SQRT2 / scaling; + int i = 0; + +#if SRSLTE_SIMD_CF_SIZE != 0 + simd_f_t _norm = srslte_simd_f_set1(norm); + + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t x0 = srslte_simd_cf_set1(0.0f); + simd_f_t hh = srslte_simd_f_set1(0.0f); + + for (int k = 0; k < 2; k++) { + simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]); + simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]); + simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]); + + simd_cf_t hx; + switch (codebook_idx) { + case 0: + hx = srslte_simd_cf_add(h0xi, h1xi); + break; + case 1: + hx = srslte_simd_cf_sub(h0xi, h1xi); + break; + case 2: + hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi)); + break; + case 3: + hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi)); + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh); + x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0); + } + + hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh)); + srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh)); + srslte_simd_f_store(&csi[i], srslte_simd_f_mul(srslte_simd_f_rcp(hh), srslte_simd_f_set1((float) M_SQRT1_2))); + } +#endif /* SRSLTE_SIMD_CF_SIZE */ + + for (; i < nof_symbols; i += 1) { + cf_t h0, h1; + float hh, _csi; + + switch (codebook_idx) { + case 0: + h0 = h[0][0][i] + h[1][0][i]; + h1 = h[0][1][i] + h[1][1][i]; + break; + case 1: + h0 = h[0][0][i] - h[1][0][i]; + h1 = h[0][1][i] - h[1][1][i]; + break; + case 2: + h0 = h[0][0][i] + _Complex_I * h[1][0][i]; + h1 = h[0][1][i] + _Complex_I * h[1][1][i]; + break; + case 3: + h0 = h[0][0][i] - _Complex_I * h[1][0][i]; + h1 = h[0][1][i] - _Complex_I * h[1][1][i]; + break; + default: + fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); + return SRSLTE_ERROR; + } + + _csi = crealf(h0) * crealf(h0) + cimagf(h0) * cimagf(h0) + crealf(h1) * crealf(h1) + cimagf(h1) * cimagf(h1); + hh = norm / _csi; + + x[0][i] = (conjf(h0) * y[0][i] + conjf(h1) * y[1][i]) * hh; + csi[i] = _csi / norm * (float) M_SQRT1_2; + } + return SRSLTE_SUCCESS; +} + +static int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int nof_rxant, + int nof_ports, + int nof_layers, + int codebook_idx, + int nof_symbols, + float scaling, + float noise_estimate) { if (nof_ports == 2 && nof_rxant <= 2) { if (nof_layers == 2) { switch (mimo_decoder) { case SRSLTE_MIMO_DECODER_ZF: -#ifdef LV_HAVE_AVX - return srslte_predecoding_multiplex_2x2_zf_avx(y, h, x, codebook_idx, nof_symbols, scaling); -#else -#ifdef LV_HAVE_SSE - return srslte_predecoding_multiplex_2x2_zf_sse(y, h, x, codebook_idx, nof_symbols, scaling); -#else - return srslte_predecoding_multiplex_2x2_zf_gen(y, h, x, codebook_idx, nof_symbols, scaling); -#endif /* LV_HAVE_SSE */ -#endif /* LV_HAVE_AVX */ + if (csi && csi[0]) { + return srslte_predecoding_multiplex_2x2_zf_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling); + } else { + return srslte_predecoding_multiplex_2x2_zf(y, h, x, codebook_idx, nof_symbols, scaling); + } break; case SRSLTE_MIMO_DECODER_MMSE: -#ifdef LV_HAVE_AVX - return srslte_predecoding_multiplex_2x2_mmse_avx(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); -#else -#ifdef LV_HAVE_SSE - return srslte_predecoding_multiplex_2x2_mmse_sse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); -#else - return srslte_predecoding_multiplex_2x2_mmse_gen(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); -#endif /* LV_HAVE_SSE */ -#endif /* LV_HAVE_AVX */ + if (csi && csi[0]) { + return srslte_predecoding_multiplex_2x2_mmse_csi(y, + h, + x, + csi, + codebook_idx, + nof_symbols, + scaling, + noise_estimate); + } else { + return srslte_predecoding_multiplex_2x2_mmse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); + } break; } } else { -#ifdef LV_HAVE_AVX - return srslte_predecoding_multiplex_2x1_mrc_avx(y, h, x, codebook_idx, nof_symbols, scaling); -#else -#ifdef LV_HAVE_SSE - return srslte_predecoding_multiplex_2x1_mrc_sse(y, h, x, codebook_idx, nof_symbols, scaling); -#else - return srslte_predecoding_multiplex_2x1_mrc_gen(y, h, x, codebook_idx, nof_symbols, scaling); -#endif /* LV_HAVE_SSE */ -#endif /* LV_HAVE_AVX */ + if (csi && csi[0]) { + return srslte_predecoding_multiplex_2x1_mrc_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling); + } else { + return srslte_predecoding_multiplex_2x1_mrc(y, h, x, codebook_idx, nof_symbols, scaling); + } } } else if (nof_ports == 4) { DEBUG("Error predecoding multiplex: not implemented for %d Tx ports", nof_ports); @@ -1759,7 +1819,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS] } break; case SRSLTE_MIMO_TYPE_SPATIAL_MULTIPLEX: - return srslte_predecoding_multiplex(y, h, x, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols, + return srslte_predecoding_multiplex(y, h, x, csi, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols, scaling, noise_estimate); default: return SRSLTE_ERROR;