Added CSI Predecoding for TM4 with SIMD Support

2018-04-20 11:17:33 +02:00 · 2018-04-20 11:17:33 +02:00 · 7146819fcd
parent bad1291843
commit 7146819fcd
1 changed files with 488 additions and 428 deletions
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@ -1118,12 +1118,15 @@ int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS],
  return SRSLTE_ERROR;
 }

-#ifdef LV_HAVE_AVX
-
-// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
-  float norm = 1.0;
+static int srslte_predecoding_multiplex_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS],
+                                                   cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                                   cf_t *x[SRSLTE_MAX_LAYERS],
+                                                   float *csi,
+                                                   int codebook_idx,
+                                                   int nof_symbols,
+                                                   float scaling) {
+  float norm = 1.0f;
+  int i = 0;

  switch (codebook_idx) {
    case 0:
@ -1138,143 +1141,55 @@ int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
      return SRSLTE_ERROR;
  }

-  for (int i = 0; i < nof_symbols - 3; i += 4) {
-    __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i]));
-    __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i]));
-    __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i]));
-    __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i]));
+#if SRSLTE_SIMD_CF_SIZE != 0
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
+    simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
+    simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
+    simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);

-    __m256 h00, h01, h10, h11;
+    simd_cf_t h00, h01, h10, h11;
    switch (codebook_idx) {
      case 0:
-        h00 = _h00;
-        h01 = _h10;
-        h10 = _h01;
-        h11 = _h11;
+        h00 = h00i;
+        h01 = h10i;
+        h10 = h01i;
+        h11 = h11i;
        break;
      case 1:
-        h00 = _mm256_add_ps(_h00, _h10);
-        h01 = _mm256_sub_ps(_h00, _h10);
-        h10 = _mm256_add_ps(_h01, _h11);
-        h11 = _mm256_sub_ps(_h01, _h11);
+        h00 = srslte_simd_cf_add(h00i, h10i);
+        h01 = srslte_simd_cf_sub(h00i, h10i);
+        h10 = srslte_simd_cf_add(h01i, h11i);
+        h11 = srslte_simd_cf_sub(h01i, h11i);
        break;
      case 2:
-        h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
-        h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
-        h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
-        h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
-        break;
-      default:
-        DEBUG("Wrong codebook_idx=%d\n", codebook_idx);
-        return SRSLTE_ERROR;
-    }
-
-    __m256 y0 = _mm256_load_ps((float *) &y[0][i]);
-    __m256 y1 = _mm256_load_ps((float *) &y[1][i]);
-
-    __m256 x0, x1;
-
-    srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
-
-    _mm256_store_ps((float *) &x[0][i], x0);
-    _mm256_store_ps((float *) &x[1][i], x1);
-
-  }
-
-  return SRSLTE_SUCCESS;
-}
-
-#endif /* LV_HAVE_AVX */
-
-#ifdef LV_HAVE_SSE
-
-// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
-  float norm = 1.0;
-
-  switch(codebook_idx) {
-    case 0:
-      norm = (float) M_SQRT2 / scaling;
-      break;
-    case 1:
-    case 2:
-      norm = 2.0f / scaling;
-      break;
-    default:
-      ERROR("Wrong codebook_idx=%d", codebook_idx);
-      return SRSLTE_ERROR;
-  }
-
-  for (int i = 0; i < nof_symbols - 1; i += 2) {
-    __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i]));
-    __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
-    __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
-    __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
-
-    __m128 h00, h01, h10, h11;
-    switch (codebook_idx) {
-      case 0:
-        h00 = _h00;
-        h01 = _h10;
-        h10 = _h01;
-        h11 = _h11;
-        break;
-      case 1:
-        h00 = _mm_add_ps(_h00, _h10);
-        h01 = _mm_sub_ps(_h00, _h10);
-        h10 = _mm_add_ps(_h01, _h11);
-        h11 = _mm_sub_ps(_h01, _h11);
-        break;
-      case 2:
-        h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10));
-        h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10));
-        h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11));
-        h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11));
+        h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
+        h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
+        h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
+        h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
        break;
      default:
        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
        return SRSLTE_ERROR;
    }

-    __m128 y0 = _mm_load_ps((float *) &y[0][i]);
-    __m128 y1 = _mm_load_ps((float *) &y[1][i]);
+    simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
+    simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);

-    __m128 x0, x1;
+    simd_cf_t x0, x1;
+    simd_f_t csi0, csi1;
+    srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);

-    srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
-
-    _mm_store_ps((float *) &x[0][i], x0);
-    _mm_store_ps((float *) &x[1][i], x1);
+    srslte_simd_cfi_store(&x[0][i], x0);
+    srslte_simd_cfi_store(&x[1][i], x1);

+    srslte_simd_f_store(&csi[i], csi0);
+    srslte_simd_f_store(&csi[i], csi1);
  }
+#endif /* SRSLTE_SIMD_CF_SIZE */

-  return SRSLTE_SUCCESS;
-}
-
-#endif /* LV_HAVE_SSE */
-
-
-// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
-  float norm = 1.0;
-
-  switch(codebook_idx) {
-    case 0:
-      norm = (float) M_SQRT2 / scaling;
-      break;
-    case 1:
-    case 2:
-      norm = 2.0f / scaling;
-      break;
-    default:
-      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-      return SRSLTE_ERROR;
-  }
-
-  for (int i = 0; i < nof_symbols; i++) {
-    cf_t h00, h01, h10, h11, det;
+  for (; i < nof_symbols; i++) {
+    cf_t h00, h01, h10, h11;

    switch (codebook_idx) {
      case 0:
@ -1300,158 +1215,27 @@ int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
        return SRSLTE_ERROR;
    }

-    det = (h00 * h11 - h01 * h10);
+    cf_t det = (h00 * h11 - h01 * h10);
    det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));

    x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
    x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
+
+    csi[i] = 1.0f;
+    csi[i] = 1.0f;
  }
  return SRSLTE_SUCCESS;
 }

-#ifdef LV_HAVE_AVX
-
-// AVX implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
-                                        float scaling, float noise_estimate) {
-  float norm = 1.0;
-
-  switch(codebook_idx) {
-    case 0:
-      norm = (float) M_SQRT2 / scaling;
-      break;
-    case 1:
-    case 2:
-      norm = 2.0f / scaling;
-      break;
-    default:
-      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-      return SRSLTE_ERROR;
-  }
-
-  for (int i = 0; i < nof_symbols; i += 4) {
-    __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i]));
-    __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i]));
-    __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i]));
-    __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i]));
-
-    __m256 h00, h01, h10, h11;
-    switch (codebook_idx) {
-      case 0:
-        h00 = _h00;
-        h01 = _h10;
-        h10 = _h01;
-        h11 = _h11;
-        break;
-      case 1:
-        h00 = _mm256_add_ps(_h00, _h10);
-        h01 = _mm256_sub_ps(_h00, _h10);
-        h10 = _mm256_add_ps(_h01, _h11);
-        h11 = _mm256_sub_ps(_h01, _h11);
-        break;
-      case 2:
-        h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
-        h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
-        h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
-        h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
-        break;
-      default:
-        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-        return SRSLTE_ERROR;
-    }
-
-    __m256 y0 = _mm256_load_ps((float *) &y[0][i]);
-    __m256 y1 = _mm256_load_ps((float *) &y[1][i]);
-
-    __m256 x0, x1;
-
-    srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm);
-
-    _mm256_store_ps((float *) &x[0][i], x0);
-    _mm256_store_ps((float *) &x[1][i], x1);
-
-  }
-
-  return SRSLTE_SUCCESS;
-}
-
-#endif /* LV_HAVE_AVX */
-
-
-#ifdef LV_HAVE_SSE
-
-// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
-                                        float scaling, float noise_estimate) {
-  float norm;
-
-  switch(codebook_idx) {
-    case 0:
-      norm = (float) M_SQRT2 / scaling;
-      break;
-    case 1:
-    case 2:
-      norm = 2.0f / scaling;
-      break;
-    default:
-      fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-      return SRSLTE_ERROR;
-  }
-
-  for (int i = 0; i < nof_symbols - 1; i += 2) {
-    __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i]));
-    __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
-    __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
-    __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
-
-    __m128 h00, h01, h10, h11;
-    switch (codebook_idx) {
-      case 0:
-        h00 = _h00;
-        h01 = _h10;
-        h10 = _h01;
-        h11 = _h11;
-        break;
-      case 1:
-        h00 = _mm_add_ps(_h00, _h10);
-        h01 = _mm_sub_ps(_h00, _h10);
-        h10 = _mm_add_ps(_h01, _h11);
-        h11 = _mm_sub_ps(_h01, _h11);
-        break;
-      case 2:
-        h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10));
-        h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10));
-        h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11));
-        h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11));
-        break;
-      default:
-        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-        return SRSLTE_ERROR;
-    }
-
-    __m128 y0 = _mm_load_ps((float *) &y[0][i]);
-    __m128 y1 = _mm_load_ps((float *) &y[1][i]);
-
-    __m128 x0, x1;
-
-    srslte_mat_2x2_mmse_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm);
-
-    _mm_store_ps((float *) &x[0][i], x0);
-    _mm_store_ps((float *) &x[1][i], x1);
-
-  }
-
-  return SRSLTE_SUCCESS;
-}
-#endif /* LV_HAVE_SSE */
-
 // Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
-int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
-                                        float scaling, float noise_estimate) {
-  float norm = 1.0;
+static int srslte_predecoding_multiplex_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS],
+                                               cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                               cf_t *x[SRSLTE_MAX_LAYERS],
+                                               int codebook_idx,
+                                               int nof_symbols,
+                                               float scaling) {
+  float norm = 1.0f;
+  int i = 0;

  switch (codebook_idx) {
    case 0:
@ -1466,7 +1250,266 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
      return SRSLTE_ERROR;
  }

-  for (int i = 0; i < nof_symbols; i++) {
+#if SRSLTE_SIMD_CF_SIZE != 0
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
+    simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
+    simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
+    simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
+
+    simd_cf_t h00, h01, h10, h11;
+    switch (codebook_idx) {
+      case 0:
+        h00 = h00i;
+        h01 = h10i;
+        h10 = h01i;
+        h11 = h11i;
+        break;
+      case 1:
+        h00 = srslte_simd_cf_add(h00i, h10i);
+        h01 = srslte_simd_cf_sub(h00i, h10i);
+        h10 = srslte_simd_cf_add(h01i, h11i);
+        h11 = srslte_simd_cf_sub(h01i, h11i);
+        break;
+      case 2:
+        h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
+        h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
+        h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
+        h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
+    simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
+
+    simd_cf_t x0, x1;
+    simd_f_t csi0, csi1;
+    srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
+
+    srslte_simd_cfi_store(&x[0][i], x0);
+    srslte_simd_cfi_store(&x[1][i], x1);
+
+  }
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+  for (; i < nof_symbols; i++) {
+    cf_t h00, h01, h10, h11;
+
+    switch (codebook_idx) {
+      case 0:
+        h00 = h[0][0][i];
+        h01 = h[1][0][i];
+        h10 = h[0][1][i];
+        h11 = h[1][1][i];
+        break;
+      case 1:
+        h00 = h[0][0][i] + h[1][0][i];
+        h01 = h[0][0][i] - h[1][0][i];
+        h10 = h[0][1][i] + h[1][1][i];
+        h11 = h[0][1][i] - h[1][1][i];
+        break;
+      case 2:
+        h00 = h[0][0][i] + _Complex_I * h[1][0][i];
+        h01 = h[0][0][i] - _Complex_I * h[1][0][i];
+        h10 = h[0][1][i] + _Complex_I * h[1][1][i];
+        h11 = h[0][1][i] - _Complex_I * h[1][1][i];
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], norm);
+  }
+  return SRSLTE_SUCCESS;
+}
+
+// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
+static int srslte_predecoding_multiplex_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS],
+                                                     cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                                     cf_t *x[SRSLTE_MAX_LAYERS],
+                                                     float *csi[SRSLTE_MAX_CODEWORDS],
+                                                     int codebook_idx,
+                                                     int nof_symbols,
+                                                     float scaling,
+                                                     float noise_estimate) {
+  float norm = 1.0f;
+  int i = 0;
+
+  switch (codebook_idx) {
+    case 0:
+      norm = (float) M_SQRT2 / scaling;
+      break;
+    case 1:
+    case 2:
+      norm = 2.0f / scaling;
+      break;
+    default:
+      ERROR("Wrong codebook_idx=%d", codebook_idx);
+      return SRSLTE_ERROR;
+  }
+
+#if SRSLTE_SIMD_CF_SIZE != 0
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
+    simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
+    simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
+    simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
+
+    simd_cf_t h00, h01, h10, h11;
+    switch (codebook_idx) {
+      case 0:
+        h00 = h00i;
+        h01 = h10i;
+        h10 = h01i;
+        h11 = h11i;
+        break;
+      case 1:
+        h00 = srslte_simd_cf_add(h00i, h10i);
+        h01 = srslte_simd_cf_sub(h00i, h10i);
+        h10 = srslte_simd_cf_add(h01i, h11i);
+        h11 = srslte_simd_cf_sub(h01i, h11i);
+        break;
+      case 2:
+        h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
+        h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
+        h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
+        h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
+    simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
+
+    simd_cf_t x0, x1;
+    simd_f_t csi0, csi1;
+    srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
+
+    srslte_simd_cfi_store(&x[0][i], x0);
+    srslte_simd_cfi_store(&x[1][i], x1);
+
+    srslte_simd_f_store(&csi[0][i], csi0);
+    srslte_simd_f_store(&csi[1][i], csi1);
+  }
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+  for (; i < nof_symbols; i++) {
+    cf_t h00, h01, h10, h11;
+
+    switch (codebook_idx) {
+      case 0:
+        h00 = h[0][0][i];
+        h01 = h[1][0][i];
+        h10 = h[0][1][i];
+        h11 = h[1][1][i];
+        break;
+      case 1:
+        h00 = h[0][0][i] + h[1][0][i];
+        h01 = h[0][0][i] - h[1][0][i];
+        h10 = h[0][1][i] + h[1][1][i];
+        h11 = h[0][1][i] - h[1][1][i];
+        break;
+      case 2:
+        h00 = h[0][0][i] + _Complex_I * h[1][0][i];
+        h01 = h[0][0][i] - _Complex_I * h[1][0][i];
+        h10 = h[0][1][i] + _Complex_I * h[1][1][i];
+        h11 = h[0][1][i] - _Complex_I * h[1][1][i];
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    srslte_mat_2x2_mmse_csi_gen(y[0][i],
+                                y[1][i],
+                                h00,
+                                h01,
+                                h10,
+                                h11,
+                                &x[0][i],
+                                &x[1][i],
+                                &csi[0][i],
+                                &csi[1][i],
+                                noise_estimate,
+                                norm);
+  }
+  return SRSLTE_SUCCESS;
+}
+
+static int srslte_predecoding_multiplex_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS],
+                                                 cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                                 cf_t *x[SRSLTE_MAX_LAYERS],
+                                                 int codebook_idx,
+                                                 int nof_symbols,
+                                                 float scaling,
+                                                 float noise_estimate) {
+  float norm = 1.0;
+  int i = 0;
+
+  switch(codebook_idx) {
+    case 0:
+      norm = (float) M_SQRT2 / scaling;
+      break;
+    case 1:
+    case 2:
+      norm = 2.0f / scaling;
+      break;
+    default:
+      ERROR("Wrong codebook_idx=%d", codebook_idx);
+      return SRSLTE_ERROR;
+  }
+
+#if SRSLTE_SIMD_CF_SIZE != 0
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
+    simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
+    simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
+    simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
+
+    simd_cf_t h00, h01, h10, h11;
+    switch(codebook_idx) {
+      case 0:
+        h00 = h00i;
+        h01 = h10i;
+        h10 = h01i;
+        h11 = h11i;
+        break;
+      case 1:
+        h00 = srslte_simd_cf_add(h00i, h10i);
+        h01 = srslte_simd_cf_sub(h00i, h10i);
+        h10 = srslte_simd_cf_add(h01i, h11i);
+        h11 = srslte_simd_cf_sub(h01i, h11i);
+        break;
+      case 2:
+        h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
+        h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
+        h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
+        h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
+    simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
+
+    simd_cf_t x0, x1;
+    simd_f_t csi0, csi1;
+    srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
+
+    srslte_simd_cfi_store(&x[0][i], x0);
+    srslte_simd_cfi_store(&x[1][i], x1);
+  }
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+  for (; i < nof_symbols; i++) {
    cf_t h00, h01, h10, h11;

    switch(codebook_idx) {
@ -1498,130 +1541,58 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
  return SRSLTE_SUCCESS;
 }

-#ifdef LV_HAVE_AVX

-// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
-int srslte_predecoding_multiplex_2x1_mrc_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                             cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
-
-  for (int i = 0; i < nof_symbols - 3; i += 4) {
-    __m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i]));
-    __m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i]));
-    __m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i]));
-    __m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i]));
-
-    __m256 h0, h1;
-    switch (codebook_idx) {
-      case 0:
-        h0 = _mm256_add_ps(_h00, _h10);
-        h1 = _mm256_add_ps(_h01, _h11);
-        break;
-      case 1:
-        h0 = _mm256_sub_ps(_h00, _h10);
-        h1 = _mm256_sub_ps(_h01, _h11);
-        break;
-      case 2:
-        h0 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
-        h1 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
-        break;
-      case 3:
-        h0 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
-        h1 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
-        break;
-      default:
-        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-        return SRSLTE_ERROR;
-    }
-
-    __m256 h0_2 = _mm256_mul_ps(h0, h0);
-    __m256 h1_2 = _mm256_mul_ps(h1, h1);
-    __m256 hh0 = _mm256_add_ps(_mm256_movehdup_ps(h0_2), _mm256_moveldup_ps(h0_2));
-    __m256 hh1 = _mm256_add_ps(_mm256_movehdup_ps(h1_2), _mm256_moveldup_ps(h1_2));
-    __m256 hh = _mm256_add_ps(hh0, hh1);
-    __m256 hhrec = _mm256_rcp_ps(hh);
-
-    hhrec = _mm256_mul_ps(hhrec, _mm256_set1_ps((float) M_SQRT2 / scaling));
-    __m256 y0 = _mm256_load_ps((float*)&y[0][i]);
-    __m256 y1 = _mm256_load_ps((float*)&y[1][i]);
-
-    __m256 x0 = _mm256_add_ps(_MM256_PROD_PS(_MM256_CONJ_PS(h0), y0), _MM256_PROD_PS(_MM256_CONJ_PS(h1), y1));
-    x0 = _mm256_mul_ps(hhrec, x0);
-
-    _mm256_store_ps((float*)&x[0][i], x0);
-
-  }
-
-  return SRSLTE_SUCCESS;
-}
-
-#endif /* LV_HAVE_AVX */
-
-
-// SSE  implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
-#ifdef LV_HAVE_SSE
-
-int srslte_predecoding_multiplex_2x1_mrc_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                             cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
-
-  for (int i = 0; i < nof_symbols - 1; i += 2) {
-    __m128 _h00 = _mm_load_ps((float*)&(h[0][0][i]));
-    __m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
-    __m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
-    __m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
-
-    __m128 h0, h1;
-    switch (codebook_idx) {
-      case 0:
-        h0 = _mm_add_ps(_h00, _h10);
-        h1 = _mm_add_ps(_h01, _h11);
-        break;
-      case 1:
-        h0 = _mm_sub_ps(_h00, _h10);
-        h1 = _mm_sub_ps(_h01, _h11);
-        break;
-      case 2:
-        h0 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10));
-        h1 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11));
-        break;
-      case 3:
-        h0 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10));
-        h1 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11));
-        break;
-      default:
-        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
-        return SRSLTE_ERROR;
-    }
-
-    __m128 h0_2 = _mm_mul_ps(h0, h0);
-    __m128 h1_2 = _mm_mul_ps(h1, h1);
-    __m128 hh0 = _mm_add_ps(_mm_movehdup_ps(h0_2), _mm_moveldup_ps(h0_2));
-    __m128 hh1 = _mm_add_ps(_mm_movehdup_ps(h1_2), _mm_moveldup_ps(h1_2));
-    __m128 hh = _mm_add_ps(hh0, hh1);
-    __m128 hhrec = _mm_rcp_ps(hh);
-
-    hhrec = _mm_mul_ps(hhrec, _mm_set1_ps((float) M_SQRT2 / scaling));
-
-    __m128 y0 = _mm_load_ps((float*)&y[0][i]);
-    __m128 y1 = _mm_load_ps((float*)&y[1][i]);
-
-    __m128 x0 = _mm_add_ps(_MM_PROD_PS(_MM_CONJ_PS(h0), y0), _MM_PROD_PS(_MM_CONJ_PS(h1), y1));
-    x0 = _mm_mul_ps(hhrec, x0);
-
-    _mm_store_ps((float*)&x[0][i], x0);
-
-  }
-
-  return SRSLTE_SUCCESS;
-}
-
-#endif /* LV_HAVE_SSE */
-
-// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
-int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
-                                        cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
+// Implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
+static int srslte_predecoding_multiplex_2x1_mrc(cf_t *y[SRSLTE_MAX_PORTS],
+                                                cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                                cf_t *x[SRSLTE_MAX_LAYERS],
+                                                int codebook_idx,
+                                                int nof_symbols,
+                                                float scaling) {
  float norm = (float) M_SQRT2 / scaling;
+  int i = 0;

-  for (int i = 0; i < nof_symbols; i += 1) {
+#if SRSLTE_SIMD_CF_SIZE != 0
+  simd_f_t _norm = srslte_simd_f_set1(norm);
+
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t x0 = srslte_simd_cf_set1(0.0f);
+    simd_f_t hh = srslte_simd_f_set1(0.0f);
+
+    for (int k = 0; k < 2; k++) {
+      simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]);
+      simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]);
+      simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]);
+
+      simd_cf_t hx;
+      switch (codebook_idx) {
+        case 0:
+          hx = srslte_simd_cf_add(h0xi, h1xi);
+          break;
+        case 1:
+          hx = srslte_simd_cf_sub(h0xi, h1xi);
+          break;
+        case 2:
+          hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi));
+          break;
+        case 3:
+          hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi));
+          break;
+        default:
+          fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+          return SRSLTE_ERROR;
+      }
+
+      hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh);
+      x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0);
+    }
+
+    hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh));
+    srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh));
+  }
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+  for (; i < nof_symbols; i += 1) {
    cf_t h0, h1;
    float hh;

@ -1654,46 +1625,135 @@ int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[
  return SRSLTE_SUCCESS;
 }

-int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
-                                 int nof_rxant, int nof_ports, int nof_layers, int codebook_idx, int nof_symbols,
-                                 float scaling, float noise_estimate)
-{
+// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
+static int srslte_predecoding_multiplex_2x1_mrc_csi(cf_t *y[SRSLTE_MAX_PORTS],
+                                                    cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                                    cf_t *x[SRSLTE_MAX_LAYERS],
+                                                    float *csi,
+                                                    int codebook_idx,
+                                                    int nof_symbols,
+                                                    float scaling) {
+  float norm = (float) M_SQRT2 / scaling;
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE != 0
+  simd_f_t _norm = srslte_simd_f_set1(norm);
+
+  for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t x0 = srslte_simd_cf_set1(0.0f);
+    simd_f_t hh = srslte_simd_f_set1(0.0f);
+
+    for (int k = 0; k < 2; k++) {
+      simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]);
+      simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]);
+      simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]);
+
+      simd_cf_t hx;
+      switch (codebook_idx) {
+        case 0:
+          hx = srslte_simd_cf_add(h0xi, h1xi);
+          break;
+        case 1:
+          hx = srslte_simd_cf_sub(h0xi, h1xi);
+          break;
+        case 2:
+          hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi));
+          break;
+        case 3:
+          hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi));
+          break;
+        default:
+          fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+          return SRSLTE_ERROR;
+      }
+
+      hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh);
+      x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0);
+    }
+
+    hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh));
+    srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh));
+    srslte_simd_f_store(&csi[i], srslte_simd_f_mul(srslte_simd_f_rcp(hh), srslte_simd_f_set1((float) M_SQRT1_2)));
+  }
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+  for (; i < nof_symbols; i += 1) {
+    cf_t h0, h1;
+    float hh, _csi;
+
+    switch (codebook_idx) {
+      case 0:
+        h0 = h[0][0][i] + h[1][0][i];
+        h1 = h[0][1][i] + h[1][1][i];
+        break;
+      case 1:
+        h0 = h[0][0][i] - h[1][0][i];
+        h1 = h[0][1][i] - h[1][1][i];
+        break;
+      case 2:
+        h0 = h[0][0][i] + _Complex_I * h[1][0][i];
+        h1 = h[0][1][i] + _Complex_I * h[1][1][i];
+        break;
+      case 3:
+        h0 = h[0][0][i] - _Complex_I * h[1][0][i];
+        h1 = h[0][1][i] - _Complex_I * h[1][1][i];
+        break;
+      default:
+        fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
+        return SRSLTE_ERROR;
+    }
+
+    _csi = crealf(h0) * crealf(h0) + cimagf(h0) * cimagf(h0) + crealf(h1) * crealf(h1) + cimagf(h1) * cimagf(h1);
+    hh = norm / _csi;
+
+    x[0][i] = (conjf(h0) * y[0][i] + conjf(h1) * y[1][i]) * hh;
+    csi[i] = _csi / norm * (float) M_SQRT1_2;
+  }
+  return SRSLTE_SUCCESS;
+}
+
+static int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS],
+                                        cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
+                                        cf_t *x[SRSLTE_MAX_LAYERS],
+                                        float *csi[SRSLTE_MAX_CODEWORDS],
+                                        int nof_rxant,
+                                        int nof_ports,
+                                        int nof_layers,
+                                        int codebook_idx,
+                                        int nof_symbols,
+                                        float scaling,
+                                        float noise_estimate) {
  if (nof_ports == 2 && nof_rxant <= 2) {
    if (nof_layers == 2) {
      switch (mimo_decoder) {
        case SRSLTE_MIMO_DECODER_ZF:
-#ifdef LV_HAVE_AVX
-          return srslte_predecoding_multiplex_2x2_zf_avx(y, h, x, codebook_idx, nof_symbols, scaling);
-#else
-#ifdef LV_HAVE_SSE
-          return srslte_predecoding_multiplex_2x2_zf_sse(y, h, x, codebook_idx, nof_symbols, scaling);
-#else
-          return srslte_predecoding_multiplex_2x2_zf_gen(y, h, x, codebook_idx, nof_symbols, scaling);
-#endif /* LV_HAVE_SSE */
-#endif /* LV_HAVE_AVX */
+          if (csi && csi[0]) {
+            return srslte_predecoding_multiplex_2x2_zf_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
+          } else {
+            return srslte_predecoding_multiplex_2x2_zf(y, h, x, codebook_idx, nof_symbols, scaling);
+          }
          break;
        case SRSLTE_MIMO_DECODER_MMSE:
-#ifdef LV_HAVE_AVX
-          return srslte_predecoding_multiplex_2x2_mmse_avx(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
-#else
-#ifdef LV_HAVE_SSE
-          return srslte_predecoding_multiplex_2x2_mmse_sse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
-#else
-          return srslte_predecoding_multiplex_2x2_mmse_gen(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
-#endif /* LV_HAVE_SSE */
-#endif /* LV_HAVE_AVX */
+          if (csi && csi[0]) {
+            return srslte_predecoding_multiplex_2x2_mmse_csi(y,
+                                                             h,
+                                                             x,
+                                                             csi,
+                                                             codebook_idx,
+                                                             nof_symbols,
+                                                             scaling,
+                                                             noise_estimate);
+          } else {
+            return srslte_predecoding_multiplex_2x2_mmse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
+          }
          break;
      }
    } else {
-#ifdef LV_HAVE_AVX
-      return srslte_predecoding_multiplex_2x1_mrc_avx(y, h, x, codebook_idx, nof_symbols, scaling);
-#else
-#ifdef LV_HAVE_SSE
-      return srslte_predecoding_multiplex_2x1_mrc_sse(y, h, x, codebook_idx, nof_symbols, scaling);
-#else
-      return srslte_predecoding_multiplex_2x1_mrc_gen(y, h, x, codebook_idx, nof_symbols, scaling);
-#endif /* LV_HAVE_SSE */
-#endif /* LV_HAVE_AVX */
+      if (csi && csi[0]) {
+        return srslte_predecoding_multiplex_2x1_mrc_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
+      } else {
+        return srslte_predecoding_multiplex_2x1_mrc(y, h, x, codebook_idx, nof_symbols, scaling);
+      }
    }
  } else if (nof_ports == 4) {
    DEBUG("Error predecoding multiplex: not implemented for %d Tx ports", nof_ports);
@ -1759,7 +1819,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS]
    }
    break;
  case SRSLTE_MIMO_TYPE_SPATIAL_MULTIPLEX:
-    return srslte_predecoding_multiplex(y, h, x, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols,
+    return srslte_predecoding_multiplex(y, h, x, csi, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols,
                                        scaling, noise_estimate);
    default:
      return SRSLTE_ERROR;