Added more functions

2017-09-28 11:04:26 +02:00 · 2017-09-28 11:04:26 +02:00 · 9e5f999666
parent c41ad5453c
commit 9e5f999666
9 changed files with 939 additions and 294 deletions
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@ -88,6 +88,8 @@
 #define SRSLTE_SIMD_F_SIZE    16
 #define SRSLTE_SIMD_CF_SIZE   16

+#define SRSLTE_SIMD_I_SIZE    16
+
 #define SRSLTE_SIMD_S_SIZE    32
 #define SRSLTE_SIMD_C16_SIZE  0

@ -97,6 +99,8 @@
 #define SRSLTE_SIMD_F_SIZE    8
 #define SRSLTE_SIMD_CF_SIZE   8

+#define SRSLTE_SIMD_I_SIZE    8
+
 #define SRSLTE_SIMD_S_SIZE    16
 #define SRSLTE_SIMD_C16_SIZE  16

@ -106,6 +110,8 @@
 #define SRSLTE_SIMD_F_SIZE    4
 #define SRSLTE_SIMD_CF_SIZE   4

+#define SRSLTE_SIMD_I_SIZE    4
+
 #define SRSLTE_SIMD_S_SIZE    8
 #define SRSLTE_SIMD_C16_SIZE  8

@ -114,6 +120,8 @@
 #define SRSLTE_SIMD_F_SIZE    0
 #define SRSLTE_SIMD_CF_SIZE   0

+#define SRSLTE_SIMD_I_SIZE    0
+
 #define SRSLTE_SIMD_S_SIZE    0
 #define SRSLTE_SIMD_C16_SIZE  0

@ -223,6 +231,20 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }

+static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_rcp_ps(a);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_rcp_ps(a);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  return _mm_rcp_ps(a);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
 static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
  __m512 r = _mm512_add_ps(a, b);
@ -600,6 +622,61 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
  return ret;
 }

+static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
+    simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  b = _mm512_permutexvar_ps(b, _mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
+  ret.re = _mm512_mul_ps(a.re, b);
+  ret.im = _mm512_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  b = _mm256_permutevar8x32_ps(b, _mm256_setr_epi32(0,4,1,5,2,6,3,7));
+  ret.re = _mm256_mul_ps(a.re, b);
+  ret.im = _mm256_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re = _mm_mul_ps(a.re, b);
+  ret.im = _mm_mul_ps(a.im, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+    return ret;
+}
+
+static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm512_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
+  ret.re = _mm512_mul_ps(a.re, rcp);
+  ret.im = _mm512_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  simd_f_t a2re = _mm256_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm256_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm256_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm256_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm256_xor_ps(_mm256_set1_ps(-0.0f), a.im);
+  ret.re = _mm256_mul_ps(a.re, rcp);
+  ret.im = _mm256_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  simd_f_t a2re = _mm_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
+  ret.re = _mm_mul_ps(a.re, rcp);
+  ret.im = _mm_mul_ps(neg_a_im, rcp);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
+
 static inline simd_cf_t srslte_simd_cf_zero (void) {
  simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
@ -621,6 +698,106 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {

 #endif /* SRSLTE_SIMD_CF_SIZE */

+#if SRSLTE_SIMD_I_SIZE
+
+#ifdef LV_HAVE_AVX512
+typedef __m512i simd_i_t;
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+typedef __m256i simd_i_t;
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+typedef __m128i simd_i_t;
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+
+static inline simd_i_t srslte_simd_i_load(int *x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_load_epi32((__m512i*)x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_load_si256((__m256i*)x);
+#else
+  #ifdef LV_HAVE_SSE
+  return _mm_load_si128((__m128i*)x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
+#ifdef LV_HAVE_AVX512
+  _mm512_store_epi32((__m512i*)x, reg);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  _mm256_store_si256((__m256i*)x, reg);
+#else
+#ifdef LV_HAVE_SSE
+  _mm_store_si128((__m128i*)x, reg);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_set1(int x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_set1_epi32(x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_set1_epi32(x);
+#else
+  #ifdef LV_HAVE_SSE
+  return _mm_set1_epi32(x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_add_epi32(a, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_add_epi32(a, b);
+#else
+#ifdef LV_HAVE_SSE
+  return _mm_add_epi32(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX512
+  return (simd_i_t) _mm512_cmp_ps_mask(a, b, _CMP_GT_OS);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  return  (simd_i_t) _mm_cmpgt_ps(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) {
+#ifdef LV_HAVE_AVX512
+  return (__m512i) _mm512_blendv_ps((__m512)a, (__m512) b, (__m512) selector);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b,(__m256) selector);
+#else
+  #ifdef LV_HAVE_SSE
+  return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, (__m128)selector);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+#endif /* SRSLTE_SIMD_I_SIZE*/
+

 #if SRSLTE_SIMD_S_SIZE

@ -829,6 +1006,20 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
  return ret;
 }

+static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
+  simd_c16_t ret;
+#ifdef LV_HAVE_AVX2
+  ret.re.m256 = _mm256_loadu_si256((__m256i*)(re));
+  ret.im.m256 = _mm256_loadu_si256((__m256i*)(im));
+#else
+#ifdef LV_HAVE_SSE
+  ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
+  ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+  return ret;
+}
+
 static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
@ -845,6 +1036,22 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }

+static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
+  __m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
+  _mm256_storeu_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
+  _mm256_storeu_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
+#else
+#ifdef LV_HAVE_SSE
+  __m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
+  __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
+  _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
+  _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
  _mm256_store_si256((__m256i *) re, simdreg.re.m256);
@ -857,6 +1064,18 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si
 #endif /* LV_HAVE_AVX2 */
 }

+static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  _mm256_storeu_si256((__m256i *) re, simdreg.re.m256);
+  _mm256_storeu_si256((__m256i *) im, simdreg.im.m256);
+#else
+#ifdef LV_HAVE_SSE
+  _mm_storeu_si128((__m128i *) re, simdreg.re.m128);
+  _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
  simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@ -905,6 +1124,24 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {

 #endif /* SRSLTE_SIMD_C16_SIZE */

+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE

+static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX2
+  __m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
+  __m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
+  __m256i ai = _mm256_cvttps_epi32(aa);
+  __m256i bi = _mm256_cvttps_epi32(bb);
+  return _mm256_packs_epi32(ai, bi);
+#else
+#ifdef LV_HAVE_SSE
+  __m128i ai = _mm_cvttps_epi32(a);
+  __m128i bi = _mm_cvttps_epi32(b);
+  return _mm_packs_epi32(ai, bi);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_C16_SIZE */

 #endif //SRSLTE_SIMD_H_H
--- a/lib/include/srslte/phy/utils/vector.h
+++ b/lib/include/srslte/phy/utils/vector.h
@ -123,6 +123,7 @@ SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint

 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len);

 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
@ -142,8 +143,8 @@ SRSLTE_API float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len);
 SRSLTE_API int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len); 

 /* z=x/y vector division (element-wise) */
-SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len);
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len);
+SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len);

 /* conjugate */
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@ -36,26 +36,29 @@ extern "C" {
 #include "srslte/config.h"

 #ifdef LV_HAVE_AVX512
+#define SRSLTE_SIMD_BIT_ALIGN 512
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0)
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX
+#define SRSLTE_SIMD_BIT_ALIGN 256
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0)
 #else /* LV_HAVE_AVX */
 #ifdef LV_HAVE_SSE
+#define SRSLTE_SIMD_BIT_ALIGN 128
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
 #else /* LV_HAVE_SSE */
+#define SRSLTE_SIMD_BIT_ALIGN 64
 #define SRSLTE_IS_ALIGNED(PTR) (1)
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX */
 #endif /* LV_HAVE_AVX512 */

-SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
-
+/* SIMD Basic vector math */
 SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);

 SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);

-SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API float srslte_vec_acc_ff_simd(float *x, int len);

 SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);

@ -63,59 +66,63 @@ SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);

 SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);

+/* SIMD Vector Scalar Product */
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+
 SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);

 SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);

+/* SIMD Vector Product */
+SRSLTE_API void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                             int16_t *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+
+SRSLTE_API void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
+
 SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);

 SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);

 SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);

-SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+/* SIMD Division */
+SRSLTE_API void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);

-SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
-                                             int16_t *r_im, int len);
+SRSLTE_API void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len);

-SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+SRSLTE_API void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len);

+/* SIMD Dot product */
 SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);

 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);

-SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
-
 SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);

-SRSLTE_API  void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
+SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);

+/* SIMD Modulus functions */
 SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);

 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);

-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
+/* Other Functions */
+SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len);

-SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len); 
-
-SRSLTE_API  void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len); 
-
-SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len);

 SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);

+
+/* SIMD Find Max functions */
+SRSLTE_API uint32_t srslte_vec_max_fi_simd(float *x, int len);
+
+SRSLTE_API uint32_t srslte_vec_max_ci_simd(cf_t *x, int len);
+
 #ifdef __cplusplus
 }
 #endif
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@ -36,17 +36,16 @@

 #ifdef LV_HAVE_SSE
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
 #endif

 #ifdef LV_HAVE_AVX
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 #endif

+#include "srslte/phy/utils/mat.h"

 static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;

--- a/lib/src/phy/sync/find_sss.c
+++ b/lib/src/phy/sync/find_sss.c
@ -70,14 +70,12 @@ static void corr_all_sz_partial(cf_t z[SRSLTE_SSS_N], float s[SRSLTE_SSS_N][SRSL

 static void extract_pair_sss(srslte_sss_synch_t *q, cf_t *input, cf_t *ce, cf_t y[2][SRSLTE_SSS_N]) {
  cf_t input_fft[SRSLTE_SYMBOL_SZ_MAX];
-  float ce_mod[2*SRSLTE_SSS_N], z_real[2*SRSLTE_SSS_N], z_imag[2*SRSLTE_SSS_N];

  srslte_dft_run_c(&q->dftp_input, input, input_fft);
  
  if (ce) {
-    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce, ce_mod, 
-                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], z_real, z_imag,
-                       2*SRSLTE_SSS_N);
+    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce,
+                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], 2*SRSLTE_SSS_N);
  }
  
  for (int i = 0; i < SRSLTE_SSS_N; i++) {
--- a/lib/src/phy/utils/test/CMakeLists.txt
+++ b/lib/src/phy/utils/test/CMakeLists.txt
@ -45,3 +45,4 @@ add_test(algebra_2x2_mmse_solver_test algebra_test -m)

 add_executable(vector_test vector_test.c)
 target_link_libraries(vector_test srslte_phy)
+add_test(vector_test vector_test)
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@ -89,6 +89,26 @@ float squared_error (cf_t a, cf_t b) {
  return diff_re*diff_re + diff_im*diff_im;
 }

+TEST(srslte_vec_acc_ff,
+     MALLOC(float, x);
+         float z;
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+         }
+
+         TEST_CALL(z = srslte_vec_acc_ff(x, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold += x[i];
+         }
+
+         mse += fabs(gold - z) / gold;
+
+         free(x);
+)
+
 TEST(srslte_vec_dot_prod_sss,
     MALLOC(int16_t, x);
         MALLOC(int16_t, y);
@ -314,6 +334,37 @@ TEST(srslte_vec_prod_ccc,
  free(z);
 )

+TEST(srslte_vec_prod_ccc_split,
+     MALLOC(float, x_re);
+     MALLOC(float, x_im);
+     MALLOC(float, y_re);
+     MALLOC(float, y_im);
+     MALLOC(float, z_re);
+     MALLOC(float, z_im);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x_re[i] = RANDOM_F();
+           x_im[i] = RANDOM_F();
+           y_re[i] = RANDOM_F();
+           y_im[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_prod_ccc_split(x_re, x_im, y_re, y_im, z_re, z_im, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = (x_re[i] + I * x_im[i]) * (y_re[i] + I * y_im[i]);
+           mse += cabsf(gold - (z_re[i] + I*z_im[i]));
+         }
+
+         free(x_re);
+         free(x_im);
+         free(y_re);
+         free(y_im);
+         free(z_re);
+         free(z_im);
+)
+
 TEST(srslte_vec_prod_conj_ccc,
  MALLOC(cf_t, x);
  MALLOC(cf_t, y);
@ -357,6 +408,27 @@ TEST(srslte_vec_sc_prod_ccc,
  free(z);
 )

+TEST(srslte_vec_convert_fi,
+  MALLOC(float, x);
+  MALLOC(short, z);
+      float scale = 1000.0f;
+
+  short gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = (float) RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_convert_fi(x, z, scale, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+      gold = (short) ((x[i] * scale));
+      mse += cabsf((float)gold - (float) z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
 TEST(srslte_vec_prod_fff,
  MALLOC(float, x);
  MALLOC(float, y);
@ -376,6 +448,30 @@ TEST(srslte_vec_prod_fff,
  }

  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_prod_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(y);
  free(z);
 )

@ -461,66 +557,216 @@ TEST(srslte_vec_sc_prod_cfc,
  free(z);
 )

+TEST(srslte_vec_div_ccc,
+     MALLOC(cf_t, x);
+         MALLOC(cf_t, y);
+         MALLOC(cf_t, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_CF();
+         }
+
+         TEST_CALL(srslte_vec_div_ccc(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i]);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+
+TEST(srslte_vec_div_cfc,
+     MALLOC(cf_t, x);
+         MALLOC(float, y);
+         MALLOC(cf_t, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_div_cfc(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i])/cabsf(gold);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+
+TEST(srslte_vec_div_fff,
+     MALLOC(float, x);
+         MALLOC(float, y);
+         MALLOC(float, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+           y[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_div_fff(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i]);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+TEST(srslte_vec_max_fi,
+     MALLOC(float, x);
+
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+         }
+
+         uint32_t max_index = 0;
+         TEST_CALL(max_index = srslte_vec_max_fi(x, block_size);)
+
+         float gold_value = -INFINITY;
+         uint32_t gold_index = 0;
+         for (int i = 0; i < block_size; i++) {
+           if (gold_value < x[i]) {
+             gold_value = x[i];
+             gold_index = i;
+           }
+         }
+         mse = (gold_index != max_index) ? 1:0;
+
+         free(x);
+)
+
+TEST(srslte_vec_max_abs_ci,
+     MALLOC(cf_t, x);
+
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+         }
+
+         uint32_t max_index = 0;
+         TEST_CALL(max_index = srslte_vec_max_abs_ci(x, block_size);)
+
+         float gold_value = -INFINITY;
+         uint32_t gold_index = 0;
+         for (int i = 0; i < block_size; i++) {
+           cf_t a = x[i];
+           float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+           if (abs2 > gold_value) {
+             gold_value = abs2;
+             gold_index = (uint32_t)i;
+           }
+         }
+         mse = (gold_index != max_index) ? 1:0;
+
+         free(x);
+)
+
 int main(int argc, char **argv) {
  char func_names[MAX_FUNCTIONS][32];
  double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
  uint32_t sizes[32];
  uint32_t size_count = 0;
  uint32_t func_count = 0;
-  bool passed = true;
+  bool passed[MAX_FUNCTIONS][MAX_BLOCKS];
+  bool all_passed = true;

  for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
    func_count = 0;

-    passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_ff(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_convert_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc_split(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

-    passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
    func_count++;

    sizes[size_count] = block_size;
@ -546,10 +792,11 @@ int main(int argc, char **argv) {
  for (int i = 0; i < func_count; i++) {
    printf("%32s | ", func_names[i]);
    for (int j = 0; j < size_count; j++) {
-      printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      all_passed &= passed[i][j];
    }
    printf(" |\n");
  }

-  return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
+  return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
 }
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@ -48,18 +48,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {

 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_ACC_FUNCTION
-  float result;
-  volk_32f_accumulator_s32f(&result,x,len);
-  return result;
-#else
-   int i;
-   float z=0;
-   for (i=0;i<len;i++) {
-     z+=x[i];
-   }
-   return z;
-#endif
+  return srslte_vec_acc_ff_simd(x, len);
 }

 void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
@ -190,14 +179,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {

 // Chest UL 
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-#else
  srslte_vec_sc_prod_ccc_simd(x,h,z,len);
-#endif
 }

 // Used in turbo decoder 
@ -217,14 +199,7 @@ void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
 }

 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = (int16_t) (x[i]*scale);
-  }
-#else 
-  srslte_vec_convert_fi_sse(x, z, scale, len);
-#endif
+  srslte_vec_convert_fi_simd(x, z, scale, len);
 }

 void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
@ -234,13 +209,7 @@ void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
 }

 void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }
-#else
-  srslte_vec_lut_sss_sse(x, lut, y, len);
-#endif
+  srslte_vec_lut_sss_simd(x, lut, y, len);
 }

 void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
@ -280,7 +249,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
 */
 void *srslte_vec_malloc(uint32_t size) {
  void *ptr;
-  if (posix_memalign(&ptr,512,size)) {
+  if (posix_memalign(&ptr, SRSLTE_SIMD_BIT_ALIGN, size)) {
    return NULL;
  } else {
    return ptr;
@ -292,7 +261,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
  return realloc(ptr, new_size);
 #else
  void *new_ptr;
-  if (posix_memalign(&new_ptr,256,new_size)) {
+  if (posix_memalign(&new_ptr, SRSLTE_SIMD_BIT_ALIGN, new_size)) {
    return NULL;
  } else {
    memcpy(new_ptr, ptr, old_size);
@ -415,6 +384,7 @@ void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) {

 // Used in PSS
 void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
+  /* This function is used in initialisation only, then no optimisation is required */
  int i;
  for (i=0;i<len;i++) {
    y[i] = conjf(x[i]);
@ -423,10 +393,7 @@ void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {

 // Used in scrambling complex 
 void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
+  srslte_vec_prod_cfc_simd(x, y, z, len);
 }

 // Used in scrambling float
@ -444,6 +411,10 @@ void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
  srslte_vec_prod_ccc_simd(x,y,z,len);
 }

+void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len) {
+  srslte_vec_prod_ccc_split_simd(x_re, x_im, y_re , y_im, z_re,z_im, len);
+}
+
 // PRACH, CHEST UL, etc. 
 void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
  srslte_vec_prod_conj_ccc_simd(x,y,z,len);
@ -452,40 +423,17 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 //#define DIV_USE_VEC

 // Used in SSS 
-/* Complex division is conjugate multiplication + real division */
-void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_prod_conj_ccc(x,y,z,len);
-  srslte_vec_abs_square_cf(y,y_mod,len);
-  srslte_vec_div_cfc(z,y_mod,z,z_real,z_imag,len);  
-#else 
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i]; 
-  }
-#endif
+void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
+  srslte_vec_div_ccc_simd(x, y, z, len);
 }

 /* Complex division by float z=x/y */
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_deinterleave_cf(x, z_real, z_imag, len);
-  srslte_vec_div_fff(z_real, y, z_real, len);
-  srslte_vec_div_fff(z_imag, y, z_imag, len);
-  srslte_vec_interleave_cf(z_real, z_imag, z, len);
-#else
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i]; 
-  }
-#endif
+void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
+  srslte_vec_div_cfc_simd(x, y, z, len);
 }

 void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i];
-  }
+  srslte_vec_div_fff_simd(x, y, z, len);
 }

 // PSS. convolution 
@ -554,30 +502,7 @@ void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
 }

 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
-
-  // This is to solve an issue with incorrect type of 1st parameter in version 1.2 of volk
-#ifdef HAVE_VOLK_MAX_FUNCTION_32
-  uint32_t target=0;
-  volk_32f_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_FUNCTION_16
-  uint32_t target=0;
-  volk_32f_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  for (i=0;i<len;i++) {
-    if (x[i]>m) {
-      m=x[i];
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_fi_simd(x, len);
 }

 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
@ -616,30 +541,7 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {

 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_32
-  uint32_t target=0;
-  volk_32fc_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_16
-  uint32_t target=0;
-  volk_32fc_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  float tmp;
-  for (i=0;i<len;i++) {
-    tmp = crealf(x[i])*crealf(x[i]) + cimagf(x[i])*cimagf(x[i]);
-    if (tmp>m) {
-      m=tmp;
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_ci_simd(x, len);
 }

 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)


 /* No improvement with AVX */
-void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
-{
-#ifdef DEBUG_MODE
-  for (int i=0;i<len;i++) {
+void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
+  int i = 0;
+#ifdef LV_HAVE_SSE
+#if CMAKE_BUILD_TYPE!=Debug
+
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
+
+      for (int k = 0; k < 8; k++) {
+        int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) x;
+      }
+    }
+  } else {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
+
+      for (int k = 0; k < 8; k++) {
+        int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) x;
+      }
+    }
+  }
+#endif
+#endif
+
+  for (; i < len; i++) {
    y[lut[i]] = x[i];
  }
-#else
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* lutPtr = (__m128i*) lut;
-
-  __m128i xVal, lutVal;
-  for(;number < points; number++){
-
-    xVal   = _mm_loadu_si128(xPtr);
-    lutVal = _mm_loadu_si128(lutPtr);
-    
-    for (int i=0;i<8;i++) {
-      int16_t x = (int16_t)   _mm_extract_epi16(xVal, i); 
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
-      y[l] = x;
-    }
-    xPtr ++;
-    lutPtr ++;
-  }
-
-  number = points * 8;
-  for(;number < len; number++){
-    y[lut[number]] = x[number];
-  }
-#endif  
-#endif
 }

 /* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
-void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
+void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len) {
+  int i = 0;

-  const unsigned int eighthPoints = len / 8;
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
+  simd_f_t s = srslte_simd_f_set1(scale);
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]);

-  const float* inputVectorPtr = (const float*)x;
-  int16_t* outputVectorPtr = z;
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);

-  __m128 vScalar = _mm_set_ps1(scale);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);

-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    ret1 = _mm_mul_ps(inputVal1, vScalar);
-    ret2 = _mm_mul_ps(inputVal2, vScalar);
-
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
+      srslte_simd_s_store(&z[i], i16);
    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&x[i + SRSLTE_SIMD_F_SIZE]);

-  number = eighthPoints * 8;
-  for(; number < len; number++){
-    z[number] = (int16_t) (x[number] * scale);
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);
+
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
+
+      srslte_simd_s_storeu(&z[i], i16);
+    }
+  }
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
+
+  for(; i < len; i++){
+    z[i] = (int16_t) (x[i] * scale);
  }
-#endif
 }

+float srslte_vec_acc_ff_simd(float *x, int len) {
+  int i = 0;
+  float acc_sum = 0.0f;

-// for enb no-volk
-void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 4;
+#if SRSLTE_SIMD_F_SIZE
+  simd_f_t simd_sum = srslte_simd_f_zero();

-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);

-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);

-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
-
-    zVal = _mm_add_ps(xVal, yVal);
-
-    _mm_storeu_ps(zPtr, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
  }

-  number = points * 4;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
+  __attribute__((aligned(SRSLTE_SIMD_F_SIZE*4))) float sum[SRSLTE_SIMD_F_SIZE];
+  srslte_simd_f_store(sum, simd_sum);
+  for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
+    acc_sum += sum[k];
  }
 #endif
-}

-void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
-
-  __m256 xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_loadu_ps(xPtr);
-    yVal = _mm256_loadu_ps(yPtr);
-
-    zVal = _mm256_add_ps(xVal, yVal);
-
-    _mm256_storeu_ps(zPtr, zVal);
-
-    xPtr += 8;
-    yPtr += 8;
-    zPtr += 8;
+  for (; i<len; i++) {
+    acc_sum += x[i];
  }

-  for(number = points * 8;number < len; number++){
-    z[number] = x[number] + y[number];
-  }
-#endif
+  return acc_sum;
 }

 cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
@ -570,6 +540,34 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len)
  return result;
 }

+void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_f_t s = srslte_simd_f_load(&y[i]);
+
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t s = srslte_simd_f_loadu(&y[i]);
+
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i<len; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
 void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
  int i = 0;

@ -630,10 +628,12 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
  }
 }

-void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
+void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
  int i = 0;

 #if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
      simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
      simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
@ -642,6 +642,16 @@ void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b

      srslte_simd_cf_store(&r_re[i], &r_im[i], r);
    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cf_loadu(&a_re[i], &a_im[i]);
+      simd_cf_t b = srslte_simd_cf_loadu(&b_re[i], &b_im[i]);
+
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+      srslte_simd_cf_storeu(&r_re[i], &r_im[i], r);
+    }
+  }
 #endif

  for (; i<len; i++) {
@ -655,6 +665,8 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i
  int i = 0;

 #if SRSLTE_SIMD_C16_SIZE
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
      simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
      simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
@ -663,6 +675,16 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i

      srslte_simd_c16_store(&r_re[i], &r_im[i], r);
    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+      simd_c16_t a = srslte_simd_c16_loadu(&a_re[i], &a_im[i]);
+      simd_c16_t b = srslte_simd_c16_loadu(&b_re[i], &b_im[i]);
+
+      simd_c16_t r = srslte_simd_c16_prod(a, b);
+
+      srslte_simd_c16_storeu(&r_re[i], &r_im[i], r);
+    }
+  }
 #endif

  for (; i<len; i++) {
@ -701,6 +723,103 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
  }
 }

+void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_load(&y[i]);
+
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
+
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+
+void srslte_vec_div_cfc_simd(cf_t *x,float *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE && SRSLTE_SIMD_CF_SIZE == SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
 void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
  int i = 0;

@ -895,3 +1014,137 @@ void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
    dst[i] = src[i];
  }
 }
+
+uint32_t srslte_vec_max_fi_simd(float *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    if (x[i] > max_value) {
+      max_value = x[i];
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}
+
+uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > max_value) {
+      max_value = abs2;
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}