Added CSI Predecoding for TM4 with SIMD Support

This commit is contained in:
Xavier Arteaga 2018-04-20 11:17:33 +02:00
parent bad1291843
commit 7146819fcd
1 changed file with 488 additions and 428 deletions

View File

@ -1118,12 +1118,15 @@ int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS],
return SRSLTE_ERROR;
}
#ifdef LV_HAVE_AVX
// Generic implementation of ZF 2x2 Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
float norm = 1.0;
static int srslte_predecoding_multiplex_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi,
int codebook_idx,
int nof_symbols,
float scaling) {
float norm = 1.0f;
int i = 0;
switch (codebook_idx) {
case 0:
@ -1138,143 +1141,55 @@ int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
return SRSLTE_ERROR;
}
for (int i = 0; i < nof_symbols - 3; i += 4) {
__m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i]));
__m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i]));
__m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i]));
__m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i]));
#if SRSLTE_SIMD_CF_SIZE != 0
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
__m256 h00, h01, h10, h11;
simd_cf_t h00, h01, h10, h11;
switch (codebook_idx) {
case 0:
h00 = _h00;
h01 = _h10;
h10 = _h01;
h11 = _h11;
h00 = h00i;
h01 = h10i;
h10 = h01i;
h11 = h11i;
break;
case 1:
h00 = _mm256_add_ps(_h00, _h10);
h01 = _mm256_sub_ps(_h00, _h10);
h10 = _mm256_add_ps(_h01, _h11);
h11 = _mm256_sub_ps(_h01, _h11);
h00 = srslte_simd_cf_add(h00i, h10i);
h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = srslte_simd_cf_add(h01i, h11i);
h11 = srslte_simd_cf_sub(h01i, h11i);
break;
case 2:
h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
break;
default:
DEBUG("Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
__m256 y0 = _mm256_load_ps((float *) &y[0][i]);
__m256 y1 = _mm256_load_ps((float *) &y[1][i]);
__m256 x0, x1;
srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
_mm256_store_ps((float *) &x[0][i], x0);
_mm256_store_ps((float *) &x[1][i], x1);
}
return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
// SSE implementation of ZF 2x2 Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
float norm = 1.0;
switch(codebook_idx) {
case 0:
norm = (float) M_SQRT2 / scaling;
break;
case 1:
case 2:
norm = 2.0f / scaling;
break;
default:
ERROR("Wrong codebook_idx=%d", codebook_idx);
return SRSLTE_ERROR;
}
for (int i = 0; i < nof_symbols - 1; i += 2) {
__m128 _h00 = _mm_load_ps((float*)&(h[0][0][i]));
__m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
__m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
__m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
__m128 h00, h01, h10, h11;
switch (codebook_idx) {
case 0:
h00 = _h00;
h01 = _h10;
h10 = _h01;
h11 = _h11;
break;
case 1:
h00 = _mm_add_ps(_h00, _h10);
h01 = _mm_sub_ps(_h00, _h10);
h10 = _mm_add_ps(_h01, _h11);
h11 = _mm_sub_ps(_h01, _h11);
break;
case 2:
h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10));
h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10));
h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11));
h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11));
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
__m128 y0 = _mm_load_ps((float *) &y[0][i]);
__m128 y1 = _mm_load_ps((float *) &y[1][i]);
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m128 x0, x1;
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
_mm_store_ps((float *) &x[0][i], x0);
_mm_store_ps((float *) &x[1][i], x1);
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
srslte_simd_f_store(&csi[i], csi0);
srslte_simd_f_store(&csi[i], csi1);
}
#endif /* SRSLTE_SIMD_CF_SIZE */
return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_SSE */
// Generic implementation of ZF 2x2 Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
float norm = 1.0;
switch(codebook_idx) {
case 0:
norm = (float) M_SQRT2 / scaling;
break;
case 1:
case 2:
norm = 2.0f / scaling;
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
for (int i = 0; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11, det;
for (; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11;
switch (codebook_idx) {
case 0:
@ -1300,158 +1215,27 @@ int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
return SRSLTE_ERROR;
}
det = (h00 * h11 - h01 * h10);
cf_t det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
csi[i] = 1.0f;
csi[i] = 1.0f;
}
return SRSLTE_SUCCESS;
}
#ifdef LV_HAVE_AVX
// AVX implementation of the MMSE 2x2 Spatial Multiplexing equalizer.
//
// y             input symbol arrays; y[0] and y[1] are read.
// h             channel coefficient arrays; h[0][0], h[0][1], h[1][0] and h[1][1] are read.
// x             output arrays; x[0] and x[1] are written.
// codebook_idx  selects how the four h arrays are combined; must be 0, 1 or 2.
// nof_symbols   number of complex symbols per array.
// scaling       divisor applied to the normalisation factor.
// noise_estimate  forwarded unchanged to srslte_mat_2x2_mmse_avx().
//
// Returns SRSLTE_SUCCESS, or SRSLTE_ERROR when codebook_idx is not supported.
//
// Symbols are processed in groups of four (one 256-bit register of complex floats). A trailing group
// of fewer than four symbols is NOT processed, matching the other AVX kernels in this file.
// The aligned load/store intrinsics require every buffer to be 32-byte aligned.
int srslte_predecoding_multiplex_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                              cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
                                              float scaling, float noise_estimate) {
  float norm = 1.0;

  // Normalisation factor depends only on the codebook index, so validate and compute it once.
  switch(codebook_idx) {
    case 0:
      norm = (float) M_SQRT2 / scaling;
      break;
    case 1:
    case 2:
      norm = 2.0f / scaling;
      break;
    default:
      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
      return SRSLTE_ERROR;
  }

  // Bound is nof_symbols - 3 so every iteration has four full symbols available; iterating up to
  // nof_symbols would load and store beyond the end of the buffers when nof_symbols % 4 != 0.
  for (int i = 0; i < nof_symbols - 3; i += 4) {
    __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i]));
    __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i]));
    __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i]));
    __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i]));

    // Combine the raw coefficients according to the selected codebook entry.
    __m256 h00, h01, h10, h11;
    switch (codebook_idx) {
      case 0:
        h00 = _h00;
        h01 = _h10;
        h10 = _h01;
        h11 = _h11;
        break;
      case 1:
        h00 = _mm256_add_ps(_h00, _h10);
        h01 = _mm256_sub_ps(_h00, _h10);
        h10 = _mm256_add_ps(_h01, _h11);
        h11 = _mm256_sub_ps(_h01, _h11);
        break;
      case 2:
        h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
        h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
        h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
        h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
        break;
      default:
        // Not reachable after the validation above; kept so h00..h11 are never used uninitialised.
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
    }

    __m256 y0 = _mm256_load_ps((float *) &y[0][i]);
    __m256 y1 = _mm256_load_ps((float *) &y[1][i]);

    __m256 x0, x1;

    srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm);

    _mm256_store_ps((float *) &x[0][i], x0);
    _mm256_store_ps((float *) &x[1][i], x1);
  }

  return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
// SSE implementation of the MMSE 2x2 Spatial Multiplexing equalizer.
//
// Reads y[0], y[1] and the four h[a][b] arrays (a, b in {0, 1}), writes x[0] and x[1].
// codebook_idx must be 0, 1 or 2; anything else prints a message on stderr and returns SRSLTE_ERROR.
// Two complex symbols (one 128-bit register) are handled per iteration; when nof_symbols is odd the
// last symbol is left unprocessed. The aligned load/store intrinsics require 16-byte aligned buffers.
int srslte_predecoding_multiplex_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                              cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
                                              float scaling, float noise_estimate) {
  float norm;

  // Normalisation factor handed to the 2x2 MMSE kernel.
  if (codebook_idx == 0) {
    norm = (float) M_SQRT2 / scaling;
  } else if (codebook_idx == 1 || codebook_idx == 2) {
    norm = 2.0f / scaling;
  } else {
    fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
    return SRSLTE_ERROR;
  }

  for (int sym = 0; sym < nof_symbols - 1; sym += 2) {
    __m128 raw00 = _mm_load_ps((float *) &h[0][0][sym]);
    __m128 raw01 = _mm_load_ps((float *) &h[0][1][sym]);
    __m128 raw10 = _mm_load_ps((float *) &h[1][0][sym]);
    __m128 raw11 = _mm_load_ps((float *) &h[1][1][sym]);

    // Effective 2x2 matrix after applying the codebook combination.
    __m128 eff00, eff01, eff10, eff11;
    if (codebook_idx == 0) {
      eff00 = raw00;
      eff01 = raw10;
      eff10 = raw01;
      eff11 = raw11;
    } else if (codebook_idx == 1) {
      eff00 = _mm_add_ps(raw00, raw10);
      eff01 = _mm_sub_ps(raw00, raw10);
      eff10 = _mm_add_ps(raw01, raw11);
      eff11 = _mm_sub_ps(raw01, raw11);
    } else if (codebook_idx == 2) {
      __m128 rot10 = _MM_MULJ_PS(raw10);
      __m128 rot11 = _MM_MULJ_PS(raw11);
      eff00 = _mm_add_ps(raw00, rot10);
      eff01 = _mm_sub_ps(raw00, rot10);
      eff10 = _mm_add_ps(raw01, rot11);
      eff11 = _mm_sub_ps(raw01, rot11);
    } else {
      // Not reachable: codebook_idx was validated before the loop.
      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
      return SRSLTE_ERROR;
    }

    __m128 rx0 = _mm_load_ps((float *) &y[0][sym]);
    __m128 rx1 = _mm_load_ps((float *) &y[1][sym]);

    __m128 out0, out1;
    srslte_mat_2x2_mmse_sse(rx0, rx1, eff00, eff01, eff10, eff11, &out0, &out1, noise_estimate, norm);

    _mm_store_ps((float *) &x[0][sym], out0);
    _mm_store_ps((float *) &x[1][sym], out1);
  }

  return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_SSE */
// Generic implementation of ZF 2x2 Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
float scaling, float noise_estimate) {
float norm = 1.0;
static int srslte_predecoding_multiplex_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
int codebook_idx,
int nof_symbols,
float scaling) {
float norm = 1.0f;
int i = 0;
switch (codebook_idx) {
case 0:
@ -1466,7 +1250,266 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
return SRSLTE_ERROR;
}
for (int i = 0; i < nof_symbols; i++) {
#if SRSLTE_SIMD_CF_SIZE != 0
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
simd_cf_t h00, h01, h10, h11;
switch (codebook_idx) {
case 0:
h00 = h00i;
h01 = h10i;
h10 = h01i;
h11 = h11i;
break;
case 1:
h00 = srslte_simd_cf_add(h00i, h10i);
h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = srslte_simd_cf_add(h01i, h11i);
h11 = srslte_simd_cf_sub(h01i, h11i);
break;
case 2:
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
}
#endif /* SRSLTE_SIMD_CF_SIZE */
for (; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11;
switch (codebook_idx) {
case 0:
h00 = h[0][0][i];
h01 = h[1][0][i];
h10 = h[0][1][i];
h11 = h[1][1][i];
break;
case 1:
h00 = h[0][0][i] + h[1][0][i];
h01 = h[0][0][i] - h[1][0][i];
h10 = h[0][1][i] + h[1][1][i];
h11 = h[0][1][i] - h[1][1][i];
break;
case 2:
h00 = h[0][0][i] + _Complex_I * h[1][0][i];
h01 = h[0][0][i] - _Complex_I * h[1][0][i];
h10 = h[0][1][i] + _Complex_I * h[1][1][i];
h11 = h[0][1][i] - _Complex_I * h[1][1][i];
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], norm);
}
return SRSLTE_SUCCESS;
}
// MMSE 2x2 Spatial Multiplexing equalizer that also outputs per-symbol CSI.
//
// Reads y[0], y[1] and the four h[a][b] arrays (a, b in {0, 1}); writes the equalized symbols to
// x[0] and x[1] and one CSI value per symbol to csi[0] and csi[1] (both must be valid buffers).
// codebook_idx must be 0, 1 or 2, otherwise SRSLTE_ERROR is returned before anything is written.
// When SRSLTE_SIMD_CF_SIZE is non-zero the bulk of the symbols goes through the SIMD kernel and the
// remaining tail (or everything, without SIMD) through the scalar kernel.
static int srslte_predecoding_multiplex_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS],
                                                     cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                                     cf_t *x[SRSLTE_MAX_LAYERS],
                                                     float *csi[SRSLTE_MAX_CODEWORDS],
                                                     int codebook_idx,
                                                     int nof_symbols,
                                                     float scaling,
                                                     float noise_estimate) {
  if (codebook_idx != 0 && codebook_idx != 1 && codebook_idx != 2) {
    ERROR("Wrong codebook_idx=%d", codebook_idx);
    return SRSLTE_ERROR;
  }

  // Normalisation factor handed to the 2x2 MMSE kernels.
  float norm = (codebook_idx == 0) ? ((float) M_SQRT2 / scaling) : (2.0f / scaling);
  int pos = 0;

#if SRSLTE_SIMD_CF_SIZE != 0
  // Vector part: SRSLTE_SIMD_CF_SIZE symbols per iteration, while a full vector is still available.
  while (pos < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1) {
    simd_cf_t in00 = srslte_simd_cfi_load(&h[0][0][pos]);
    simd_cf_t in01 = srslte_simd_cfi_load(&h[0][1][pos]);
    simd_cf_t in10 = srslte_simd_cfi_load(&h[1][0][pos]);
    simd_cf_t in11 = srslte_simd_cfi_load(&h[1][1][pos]);

    // Effective 2x2 matrix after applying the codebook combination.
    simd_cf_t e00, e01, e10, e11;
    if (codebook_idx == 0) {
      e00 = in00;
      e01 = in10;
      e10 = in01;
      e11 = in11;
    } else if (codebook_idx == 1) {
      e00 = srslte_simd_cf_add(in00, in10);
      e01 = srslte_simd_cf_sub(in00, in10);
      e10 = srslte_simd_cf_add(in01, in11);
      e11 = srslte_simd_cf_sub(in01, in11);
    } else if (codebook_idx == 2) {
      e00 = srslte_simd_cf_add(in00, srslte_simd_cf_mulj(in10));
      e01 = srslte_simd_cf_sub(in00, srslte_simd_cf_mulj(in10));
      e10 = srslte_simd_cf_add(in01, srslte_simd_cf_mulj(in11));
      e11 = srslte_simd_cf_sub(in01, srslte_simd_cf_mulj(in11));
    } else {
      // Not reachable: codebook_idx was validated on entry.
      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
      return SRSLTE_ERROR;
    }

    simd_cf_t rx0 = srslte_simd_cfi_load(&y[0][pos]);
    simd_cf_t rx1 = srslte_simd_cfi_load(&y[1][pos]);

    simd_cf_t out0, out1;
    simd_f_t  q0, q1;
    srslte_mat_2x2_mmse_csi_simd(rx0, rx1, e00, e01, e10, e11, &out0, &out1, &q0, &q1, noise_estimate, norm);

    srslte_simd_cfi_store(&x[0][pos], out0);
    srslte_simd_cfi_store(&x[1][pos], out1);
    srslte_simd_f_store(&csi[0][pos], q0);
    srslte_simd_f_store(&csi[1][pos], q1);

    pos += SRSLTE_SIMD_CF_SIZE;
  }
#endif /* SRSLTE_SIMD_CF_SIZE */

  // Scalar part: whatever the vector loop did not cover.
  for (; pos < nof_symbols; pos++) {
    cf_t a00 = h[0][0][pos];
    cf_t a01 = h[0][1][pos];
    cf_t a10 = h[1][0][pos];
    cf_t a11 = h[1][1][pos];

    cf_t e00, e01, e10, e11;
    switch (codebook_idx) {
      case 0:
        e00 = a00;
        e01 = a10;
        e10 = a01;
        e11 = a11;
        break;
      case 1:
        e00 = a00 + a10;
        e01 = a00 - a10;
        e10 = a01 + a11;
        e11 = a01 - a11;
        break;
      case 2:
        e00 = a00 + _Complex_I * a10;
        e01 = a00 - _Complex_I * a10;
        e10 = a01 + _Complex_I * a11;
        e11 = a01 - _Complex_I * a11;
        break;
      default:
        // Not reachable: codebook_idx was validated on entry.
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
    }

    srslte_mat_2x2_mmse_csi_gen(y[0][pos], y[1][pos],
                                e00, e01, e10, e11,
                                &x[0][pos], &x[1][pos],
                                &csi[0][pos], &csi[1][pos],
                                noise_estimate, norm);
  }

  return SRSLTE_SUCCESS;
}
static int srslte_predecoding_multiplex_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
int codebook_idx,
int nof_symbols,
float scaling,
float noise_estimate) {
float norm = 1.0;
int i = 0;
switch(codebook_idx) {
case 0:
norm = (float) M_SQRT2 / scaling;
break;
case 1:
case 2:
norm = 2.0f / scaling;
break;
default:
ERROR("Wrong codebook_idx=%d", codebook_idx);
return SRSLTE_ERROR;
}
#if SRSLTE_SIMD_CF_SIZE != 0
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
simd_cf_t h00, h01, h10, h11;
switch(codebook_idx) {
case 0:
h00 = h00i;
h01 = h10i;
h10 = h01i;
h11 = h11i;
break;
case 1:
h00 = srslte_simd_cf_add(h00i, h10i);
h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = srslte_simd_cf_add(h01i, h11i);
h11 = srslte_simd_cf_sub(h01i, h11i);
break;
case 2:
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
}
#endif /* SRSLTE_SIMD_CF_SIZE */
for (; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11;
switch(codebook_idx) {
@ -1498,130 +1541,58 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
return SRSLTE_SUCCESS;
}
#ifdef LV_HAVE_AVX
// AVX implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer.
//
// Reads y[0], y[1] and the four h[a][b] arrays (a, b in {0, 1}); writes x[0] only.
// codebook_idx must be 0..3. It is checked inside the loop, so an unsupported value yields
// SRSLTE_ERROR only when at least one group of four symbols is processed.
// Four complex symbols are handled per iteration; a trailing group of fewer than four is skipped.
// The aligned load/store intrinsics require 32-byte aligned buffers.
int srslte_predecoding_multiplex_2x1_mrc_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                             cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
  for (int sym = 0; sym < nof_symbols - 3; sym += 4) {
    __m256 raw00 = _mm256_load_ps((float *) &h[0][0][sym]);
    __m256 raw01 = _mm256_load_ps((float *) &h[0][1][sym]);
    __m256 raw10 = _mm256_load_ps((float *) &h[1][0][sym]);
    __m256 raw11 = _mm256_load_ps((float *) &h[1][1][sym]);

    // Effective single-layer coefficients, one per y input, for the selected codebook entry.
    __m256 g0, g1;
    switch (codebook_idx) {
      case 0:
        g0 = _mm256_add_ps(raw00, raw10);
        g1 = _mm256_add_ps(raw01, raw11);
        break;
      case 1:
        g0 = _mm256_sub_ps(raw00, raw10);
        g1 = _mm256_sub_ps(raw01, raw11);
        break;
      case 2:
        g0 = _mm256_add_ps(raw00, _MM256_MULJ_PS(raw10));
        g1 = _mm256_add_ps(raw01, _MM256_MULJ_PS(raw11));
        break;
      case 3:
        g0 = _mm256_sub_ps(raw00, _MM256_MULJ_PS(raw10));
        g1 = _mm256_sub_ps(raw01, _MM256_MULJ_PS(raw11));
        break;
      default:
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
    }

    // Element-wise squares; adding the odd- and even-lane duplicates sums each adjacent float pair,
    // i.e. re^2 + im^2 per sample (assuming cf_t stores re/im interleaved).
    __m256 sq0 = _mm256_mul_ps(g0, g0);
    __m256 sq1 = _mm256_mul_ps(g1, g1);
    __m256 energy = _mm256_add_ps(_mm256_add_ps(_mm256_movehdup_ps(sq0), _mm256_moveldup_ps(sq0)),
                                  _mm256_add_ps(_mm256_movehdup_ps(sq1), _mm256_moveldup_ps(sq1)));

    // Approximate reciprocal of the energy, scaled by sqrt(2) / scaling.
    __m256 gain = _mm256_mul_ps(_mm256_rcp_ps(energy), _mm256_set1_ps((float) M_SQRT2 / scaling));

    __m256 rx0 = _mm256_load_ps((float *) &y[0][sym]);
    __m256 rx1 = _mm256_load_ps((float *) &y[1][sym]);

    __m256 combined = _mm256_add_ps(_MM256_PROD_PS(_MM256_CONJ_PS(g0), rx0),
                                    _MM256_PROD_PS(_MM256_CONJ_PS(g1), rx1));

    _mm256_store_ps((float *) &x[0][sym], _mm256_mul_ps(gain, combined));
  }

  return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_AVX */
// SSE implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer.
//
// Reads y[0], y[1] and the four h[a][b] arrays (a, b in {0, 1}); writes x[0] only.
// codebook_idx must be 0..3. It is checked inside the loop, so an unsupported value yields
// SRSLTE_ERROR only when at least one pair of symbols is processed.
// Two complex symbols are handled per iteration; with an odd nof_symbols the last one is skipped.
// The aligned load/store intrinsics require 16-byte aligned buffers.
#ifdef LV_HAVE_SSE
int srslte_predecoding_multiplex_2x1_mrc_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                             cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
  for (int sym = 0; sym < nof_symbols - 1; sym += 2) {
    __m128 raw00 = _mm_load_ps((float *) &h[0][0][sym]);
    __m128 raw01 = _mm_load_ps((float *) &h[0][1][sym]);
    __m128 raw10 = _mm_load_ps((float *) &h[1][0][sym]);
    __m128 raw11 = _mm_load_ps((float *) &h[1][1][sym]);

    // Effective single-layer coefficients, one per y input, for the selected codebook entry.
    __m128 g0, g1;
    switch (codebook_idx) {
      case 0:
        g0 = _mm_add_ps(raw00, raw10);
        g1 = _mm_add_ps(raw01, raw11);
        break;
      case 1:
        g0 = _mm_sub_ps(raw00, raw10);
        g1 = _mm_sub_ps(raw01, raw11);
        break;
      case 2:
        g0 = _mm_add_ps(raw00, _MM_MULJ_PS(raw10));
        g1 = _mm_add_ps(raw01, _MM_MULJ_PS(raw11));
        break;
      case 3:
        g0 = _mm_sub_ps(raw00, _MM_MULJ_PS(raw10));
        g1 = _mm_sub_ps(raw01, _MM_MULJ_PS(raw11));
        break;
      default:
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
    }

    // Element-wise squares; adding the odd- and even-lane duplicates sums each adjacent float pair,
    // i.e. re^2 + im^2 per sample (assuming cf_t stores re/im interleaved).
    __m128 sq0 = _mm_mul_ps(g0, g0);
    __m128 sq1 = _mm_mul_ps(g1, g1);
    __m128 energy = _mm_add_ps(_mm_add_ps(_mm_movehdup_ps(sq0), _mm_moveldup_ps(sq0)),
                               _mm_add_ps(_mm_movehdup_ps(sq1), _mm_moveldup_ps(sq1)));

    // Approximate reciprocal of the energy, scaled by sqrt(2) / scaling.
    __m128 gain = _mm_mul_ps(_mm_rcp_ps(energy), _mm_set1_ps((float) M_SQRT2 / scaling));

    __m128 rx0 = _mm_load_ps((float *) &y[0][sym]);
    __m128 rx1 = _mm_load_ps((float *) &y[1][sym]);

    __m128 combined = _mm_add_ps(_MM_PROD_PS(_MM_CONJ_PS(g0), rx0), _MM_PROD_PS(_MM_CONJ_PS(g1), rx1));

    _mm_store_ps((float *) &x[0][sym], _mm_mul_ps(gain, combined));
  }

  return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_SSE */
// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
// Implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
static int srslte_predecoding_multiplex_2x1_mrc(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
int codebook_idx,
int nof_symbols,
float scaling) {
float norm = (float) M_SQRT2 / scaling;
int i = 0;
for (int i = 0; i < nof_symbols; i += 1) {
#if SRSLTE_SIMD_CF_SIZE != 0
simd_f_t _norm = srslte_simd_f_set1(norm);
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t x0 = srslte_simd_cf_set1(0.0f);
simd_f_t hh = srslte_simd_f_set1(0.0f);
for (int k = 0; k < 2; k++) {
simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]);
simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]);
simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]);
simd_cf_t hx;
switch (codebook_idx) {
case 0:
hx = srslte_simd_cf_add(h0xi, h1xi);
break;
case 1:
hx = srslte_simd_cf_sub(h0xi, h1xi);
break;
case 2:
hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi));
break;
case 3:
hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh);
x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0);
}
hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh));
srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh));
}
#endif /* SRSLTE_SIMD_CF_SIZE */
for (; i < nof_symbols; i += 1) {
cf_t h0, h1;
float hh;
@ -1654,46 +1625,135 @@ int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[
return SRSLTE_SUCCESS;
}
int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_rxant, int nof_ports, int nof_layers, int codebook_idx, int nof_symbols,
float scaling, float noise_estimate)
{
// MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer that also outputs CSI.
//
// Reads y[0], y[1] and the four h[a][b] arrays (a, b in {0, 1}); writes the combined symbols to x[0]
// and one CSI value per symbol to csi.
// codebook_idx must be 0..3. It is checked per symbol, so an unsupported value yields SRSLTE_ERROR
// only when nof_symbols is greater than zero.
// With norm = sqrt(2) / scaling and P = |h0|^2 + |h1|^2 for the combined coefficients, each symbol is
// scaled by norm / P and its CSI is P / norm * (1 / sqrt(2)). The SIMD path obtains both through
// srslte_simd_f_rcp() instead of a division.
static int srslte_predecoding_multiplex_2x1_mrc_csi(cf_t *y[SRSLTE_MAX_PORTS],
                                                    cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
                                                    cf_t *x[SRSLTE_MAX_LAYERS],
                                                    float *csi,
                                                    int codebook_idx,
                                                    int nof_symbols,
                                                    float scaling) {
  float norm = (float) M_SQRT2 / scaling;
  int pos = 0;

#if SRSLTE_SIMD_CF_SIZE != 0
  simd_f_t norm_v = srslte_simd_f_set1(norm);

  // Vector part: SRSLTE_SIMD_CF_SIZE symbols per iteration, while a full vector is still available.
  while (pos < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1) {
    simd_cf_t acc   = srslte_simd_cf_set1(0.0f);
    simd_f_t  power = srslte_simd_f_set1(0.0f);

    // Accumulate the contribution of each of the two y inputs.
    for (int b = 0; b < 2; b++) {
      simd_cf_t ca = srslte_simd_cfi_load(&h[0][b][pos]);
      simd_cf_t cb = srslte_simd_cfi_load(&h[1][b][pos]);
      simd_cf_t rx = srslte_simd_cfi_load(&y[b][pos]);

      simd_cf_t g;
      if (codebook_idx == 0) {
        g = srslte_simd_cf_add(ca, cb);
      } else if (codebook_idx == 1) {
        g = srslte_simd_cf_sub(ca, cb);
      } else if (codebook_idx == 2) {
        g = srslte_simd_cf_add(ca, srslte_simd_cf_mulj(cb));
      } else if (codebook_idx == 3) {
        g = srslte_simd_cf_sub(ca, srslte_simd_cf_mulj(cb));
      } else {
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
      }

      power = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(g, g)), power);
      acc   = srslte_simd_cf_add(srslte_simd_cf_conjprod(rx, g), acc);
    }

    // gain = norm / power via reciprocal; the CSI is the reciprocal of that gain times 1/sqrt(2).
    simd_f_t gain = srslte_simd_f_mul(norm_v, srslte_simd_f_rcp(power));

    srslte_simd_cfi_store(&x[0][pos], srslte_simd_cf_mul(acc, gain));
    srslte_simd_f_store(&csi[pos], srslte_simd_f_mul(srslte_simd_f_rcp(gain), srslte_simd_f_set1((float) M_SQRT1_2)));

    pos += SRSLTE_SIMD_CF_SIZE;
  }
#endif /* SRSLTE_SIMD_CF_SIZE */

  // Scalar part: whatever the vector loop did not cover.
  for (; pos < nof_symbols; pos += 1) {
    cf_t g0, g1;

    if (codebook_idx == 0) {
      g0 = h[0][0][pos] + h[1][0][pos];
      g1 = h[0][1][pos] + h[1][1][pos];
    } else if (codebook_idx == 1) {
      g0 = h[0][0][pos] - h[1][0][pos];
      g1 = h[0][1][pos] - h[1][1][pos];
    } else if (codebook_idx == 2) {
      g0 = h[0][0][pos] + _Complex_I * h[1][0][pos];
      g1 = h[0][1][pos] + _Complex_I * h[1][1][pos];
    } else if (codebook_idx == 3) {
      g0 = h[0][0][pos] - _Complex_I * h[1][0][pos];
      g1 = h[0][1][pos] - _Complex_I * h[1][1][pos];
    } else {
      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
      return SRSLTE_ERROR;
    }

    // Total energy of the two combined coefficients.
    float power = crealf(g0) * crealf(g0) + cimagf(g0) * cimagf(g0) + crealf(g1) * crealf(g1) + cimagf(g1) * cimagf(g1);
    float gain  = norm / power;

    x[0][pos] = (conjf(g0) * y[0][pos] + conjf(g1) * y[1][pos]) * gain;
    csi[pos]  = power / norm * (float) M_SQRT1_2;
  }

  return SRSLTE_SUCCESS;
}
static int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_ports,
int nof_layers,
int codebook_idx,
int nof_symbols,
float scaling,
float noise_estimate) {
if (nof_ports == 2 && nof_rxant <= 2) {
if (nof_layers == 2) {
switch (mimo_decoder) {
case SRSLTE_MIMO_DECODER_ZF:
#ifdef LV_HAVE_AVX
return srslte_predecoding_multiplex_2x2_zf_avx(y, h, x, codebook_idx, nof_symbols, scaling);
#else
#ifdef LV_HAVE_SSE
return srslte_predecoding_multiplex_2x2_zf_sse(y, h, x, codebook_idx, nof_symbols, scaling);
#else
return srslte_predecoding_multiplex_2x2_zf_gen(y, h, x, codebook_idx, nof_symbols, scaling);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x2_zf_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
} else {
return srslte_predecoding_multiplex_2x2_zf(y, h, x, codebook_idx, nof_symbols, scaling);
}
break;
case SRSLTE_MIMO_DECODER_MMSE:
#ifdef LV_HAVE_AVX
return srslte_predecoding_multiplex_2x2_mmse_avx(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
#else
#ifdef LV_HAVE_SSE
return srslte_predecoding_multiplex_2x2_mmse_sse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
#else
return srslte_predecoding_multiplex_2x2_mmse_gen(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x2_mmse_csi(y,
h,
x,
csi,
codebook_idx,
nof_symbols,
scaling,
noise_estimate);
} else {
return srslte_predecoding_multiplex_2x2_mmse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
}
break;
}
} else {
#ifdef LV_HAVE_AVX
return srslte_predecoding_multiplex_2x1_mrc_avx(y, h, x, codebook_idx, nof_symbols, scaling);
#else
#ifdef LV_HAVE_SSE
return srslte_predecoding_multiplex_2x1_mrc_sse(y, h, x, codebook_idx, nof_symbols, scaling);
#else
return srslte_predecoding_multiplex_2x1_mrc_gen(y, h, x, codebook_idx, nof_symbols, scaling);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x1_mrc_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
} else {
return srslte_predecoding_multiplex_2x1_mrc(y, h, x, codebook_idx, nof_symbols, scaling);
}
}
} else if (nof_ports == 4) {
DEBUG("Error predecoding multiplex: not implemented for %d Tx ports", nof_ports);
@ -1759,7 +1819,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS]
}
break;
case SRSLTE_MIMO_TYPE_SPATIAL_MULTIPLEX:
return srslte_predecoding_multiplex(y, h, x, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols,
return srslte_predecoding_multiplex(y, h, x, csi, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols,
scaling, noise_estimate);
default:
return SRSLTE_ERROR;