Added srs_vec_cf_cpy for aligned copies, which slightly improves performance for aligned data

This commit is contained in:
Xavier Arteaga 2017-08-17 10:19:19 +02:00
parent a9d9c92205
commit 48d508aeba
4 changed files with 34 additions and 4 deletions

View File

@@ -172,6 +172,9 @@ SRSLTE_API void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t le
/* argument of each vector element */
SRSLTE_API void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len);
/* Copy a 256-bit aligned vector */
SRSLTE_API void srs_vec_cf_cpy(cf_t *dst, cf_t *src, int len);
#ifdef __cplusplus
}
#endif
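
A minimal usage sketch (for illustration only, not part of the commit): the copy expects buffers aligned for 256-bit loads, such as those returned by srslte_vec_malloc, and takes the destination first, matching the definition in vector.c. The helper name copy_aligned_example is hypothetical.

#include <stdlib.h>
#include "srslte/phy/utils/vector.h"

/* Hypothetical helper showing the intended call pattern for srs_vec_cf_cpy */
static int copy_aligned_example(int nof_symbols) {
  cf_t *src = srslte_vec_malloc(sizeof(cf_t) * nof_symbols); /* aligned allocation */
  cf_t *dst = srslte_vec_malloc(sizeof(cf_t) * nof_symbols);
  if (!src || !dst) {
    return -1;
  }
  srs_vec_cf_cpy(dst, src, nof_symbols); /* destination first, then source */
  free(src);
  free(dst);
  return 0;
}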

View File

@@ -28,6 +28,7 @@
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <srslte/phy/utils/vector.h>
#include "srslte/phy/common/phy_common.h"
#include "srslte/phy/mimo/layermap.h"
@@ -51,7 +52,12 @@ int srslte_layermap_diversity(cf_t *d, cf_t *x[SRSLTE_MAX_LAYERS], int nof_layer
int srslte_layermap_multiplex(cf_t *d[SRSLTE_MAX_CODEWORDS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_cw, int nof_layers,
                              int nof_symbols[SRSLTE_MAX_CODEWORDS]) {
  if (nof_cw == 1) {
  if (nof_cw == nof_layers) {
    for (int i = 0; i < nof_cw; i++) {
      srs_vec_cf_cpy(x[i], d[i], (uint32_t) nof_symbols[0]);
    }
    return nof_symbols[0];
  } else if (nof_cw == 1) {
    return srslte_layermap_diversity(d[0], x, nof_layers, nof_symbols[0]);
  } else {
    int n[2];

View File

@@ -96,19 +96,19 @@ int main(int argc, char **argv) {
  }
  for (i=0;i<nof_cw;i++) {
    d[i] = malloc(sizeof(cf_t) * nof_symb_cw[i]);
    d[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symb_cw[i]);
    if (!d[i]) {
      perror("malloc");
      exit(-1);
    }
    dp[i] = malloc(sizeof(cf_t) * nof_symb_cw[i]);
    dp[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symb_cw[i]);
    if (!dp[i]) {
      perror("malloc");
      exit(-1);
    }
  }
  for (i=0;i<nof_layers;i++) {
    x[i] = malloc(sizeof(cf_t) * nof_symbols);
    x[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symbols);
    if (!x[i]) {
      perror("malloc");
      exit(-1);

View File

@@ -843,3 +843,24 @@ void srslte_vec_quant_suc(int16_t *in, uint8_t *out, float gain, int16_t offset,
}
}
void srs_vec_cf_cpy(cf_t *dst, cf_t *src, int len) {
  int i = 0;
#ifdef LV_HAVE_AVX
  /* Copy 4 complex floats (256 bits) per iteration; buffers must be 256-bit aligned */
  for (; i < len - 3; i += 4) {
    _mm256_store_ps((float *) &dst[i], _mm256_load_ps((float *) &src[i]));
  }
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
  /* Copy 2 complex floats (128 bits) per iteration for the remainder */
  for (; i < len - 1; i += 2) {
    _mm_store_ps((float *) &dst[i], _mm_load_ps((float *) &src[i]));
  }
  /* Copy any last remaining complex float (64 bits) */
  for (; i < len; i++) {
    ((__m64*) dst)[i] = ((__m64*) src)[i];
  }
#else
  /* Scalar fallback when SSE is not available */
  for (; i < len; i++) {
    dst[i] = src[i];
  }
#endif /* LV_HAVE_SSE */
}
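
A quick correctness check one might run alongside this change (a sketch under assumptions, not part of the commit): fill an aligned buffer, copy it with srs_vec_cf_cpy, and compare the result against memcpy. The helper name check_cf_cpy is hypothetical.

#include <complex.h>
#include <string.h>
#include <stdlib.h>
#include "srslte/phy/utils/vector.h"

/* Hypothetical test helper: returns 0 if srs_vec_cf_cpy matches memcpy */
static int check_cf_cpy(int len) {
  cf_t *src = srslte_vec_malloc(sizeof(cf_t) * len); /* aligned allocations */
  cf_t *dst = srslte_vec_malloc(sizeof(cf_t) * len);
  cf_t *ref = srslte_vec_malloc(sizeof(cf_t) * len);
  if (!src || !dst || !ref) {
    return -1;
  }
  for (int i = 0; i < len; i++) {
    src[i] = (float) i + _Complex_I * (float) (len - i);
  }
  srs_vec_cf_cpy(dst, src, len);        /* SIMD copy under test */
  memcpy(ref, src, sizeof(cf_t) * len); /* reference copy */
  int ret = memcmp(dst, ref, sizeof(cf_t) * len) ? -1 : 0;
  free(src);
  free(dst);
  free(ref);
  return ret;
}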