From 86750b2db73516e3bb6107594005c4b686135176 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Tue, 17 Jan 2017 11:31:03 +0100
Subject: [PATCH 01/55] removed volk dependency. Checked and working

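---
Review notes (anything between this marker and the first diff is dropped by git am):

 * srslte_vec_prod_ccc_simd(), srslte_vec_prod_conj_ccc_simd() and
   srslte_vec_sc_prod_ccc_simd() finish an odd `len` by dereferencing
   xPtr/yPtr/zPtr, which are float pointers. That tail reads only the real
   part of the last sample and stores a single float, so the imaginary part
   of z[len-1] is never written, and the conj variant does not conjugate y.
   The tail has to work on the cf_t elements instead, e.g.
       z[len-1] = x[len-1] * conjf(y[len-1]);
 * The comment above srslte_vec_malloc() still says memory is aligned to
   32 bytes, while this patch raises the alignment to 64.
 * vector_simd.h gains prototypes for srslte_vec_dot_prod_ccc_simd(),
   srslte_vec_dot_prod_conj_ccc_simd(), srslte_vec_acc_ff_simd(),
   srslte_vec_dot_prod_cfc_simd() and srslte_vec_convert_if_simd(), but this
   patch adds no definitions for them to vector_simd.c.
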
 cmake/modules/FindVolk.cmake              | 144 ------------------
 srslte/CMakeLists.txt                     |  19 ---
 srslte/include/srslte/utils/vector_simd.h |  20 +++
 srslte/lib/CMakeLists.txt                 |   7 -
 srslte/lib/utils/vector.c                 | 167 ++++-----------------
 srslte/lib/utils/vector_simd.c            | 170 ++++++++++++++++++++++
 6 files changed, 222 insertions(+), 305 deletions(-)
 delete mode 100644 cmake/modules/FindVolk.cmake

diff --git a/cmake/modules/FindVolk.cmake b/cmake/modules/FindVolk.cmake
deleted file mode 100644
index 5dbe17cd5..000000000
--- a/cmake/modules/FindVolk.cmake
+++ /dev/null
@@ -1,144 +0,0 @@
-INCLUDE(FindPkgConfig)
-PKG_CHECK_MODULES(PC_VOLK volk QUIET)
-
-FIND_PATH(
-    VOLK_INCLUDE_DIRS
-    NAMES volk/volk.h
-    HINTS $ENV{VOLK_DIR}/include
-          ${CMAKE_INSTALL_PREFIX}/include
-          ${PC_VOLK_INCLUDE_DIR}
-    PATHS /usr/local/include
-          /usr/include
-)
-
-FIND_LIBRARY(
-    VOLK_LIBRARIES
-    NAMES volk
-    HINTS $ENV{VOLK_DIR}/lib
-          ${CMAKE_INSTALL_PREFIX}/lib
-          ${CMAKE_INSTALL_PREFIX}/lib64
-          ${PC_VOLK_LIBDIR}
-    PATHS /usr/local/lib
-          /usr/local/lib64
-          /usr/lib
-          /usr/lib64
-)
-
-INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
-MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS VOLK_DEFINITIONS)
-
-IF(VOLK_FOUND)
-  SET(CMAKE_REQUIRED_LIBRARIES ${VOLK_LIBRARIES} m)
-  CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_index_max_16u HAVE_VOLK_MAX_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_max_32f HAVE_VOLK_MAX_VEC_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_accumulator_s32f HAVE_VOLK_ACC_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_s32fc_multiply_32fc HAVE_VOLK_MULT_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_conjugate_32fc HAVE_VOLK_CONJ_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_multiply_32fc HAVE_VOLK_MULT2_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_multiply_conjugate_32fc HAVE_VOLK_MULT2_CONJ_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_32f_multiply_32fc HAVE_VOLK_MULT_REAL_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_s32f_multiply_32f HAVE_VOLK_MULT_FLOAT_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_magnitude_32f HAVE_VOLK_MAG_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_magnitude_squared_32f HAVE_VOLK_MAG_SQUARE_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_divide_32f HAVE_VOLK_DIVIDE_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_dot_prod_32fc HAVE_VOLK_DOTPROD_FC_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_32f_dot_prod_32fc HAVE_VOLK_DOTPROD_CFC_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_conjugate_dot_prod_32fc HAVE_VOLK_DOTPROD_CONJ_FC_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_dot_prod_32f HAVE_VOLK_DOTPROD_F_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_s32f_atan2_32f HAVE_VOLK_ATAN_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_s32f_convert_16i HAVE_VOLK_CONVERT_FI_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_deinterleave_32f_x2 HAVE_VOLK_DEINTERLEAVE_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_interleave_32fc HAVE_VOLK_INTERLEAVE_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_subtract_32f HAVE_VOLK_SUB_FLOAT_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_add_32f HAVE_VOLK_ADD_FLOAT_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_square_dist_32f HAVE_VOLK_SQUARE_DIST_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_deinterleave_real_32f HAVE_VOLK_DEINTERLEAVE_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32fc_index_max_16u HAVE_VOLK_MAX_ABS_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_multiply_32f HAVE_VOLK_MULT_REAL2_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_16i_max_star_16i HAVE_VOLK_MAX_STAR_S_FUNCTION)
-  CHECK_FUNCTION_EXISTS_MATH(volk_8i_convert_16i HAVE_VOLK_CONVERT_CI_FUNCTION)
-
-
-
-  SET(VOLK_DEFINITIONS "HAVE_VOLK")
-  IF(${HAVE_VOLK_CONVERT_CI_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_CONVERT_CI_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAX_STAR_S_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAX_STAR_S_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAX_ABS_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAX_ABS_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAX_VEC_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAX_VEC_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DOTPROD_CONJ_FC_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DOTPROD_CONJ_FC_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAG_SQUARE_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAG_SQUARE_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_SQUARE_DIST_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_SQUARE_DIST_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DEINTERLEAVE_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DEINTERLEAVE_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_INTERLEAVE_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_INTERLEAVE_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_SUB_FLOAT_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_SUB_FLOAT_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_ADD_FLOAT_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_ADD_FLOAT_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MULT2_CONJ_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MULT2_CONJ_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DEINTERLEAVE_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DEINTERLEAVE_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_CONVERT_FI_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_CONVERT_FI_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAX_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAX_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_ACC_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_ACC_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MULT_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MULT_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_CONJ_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_CONJ_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MULT2_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MULT2_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MULT_FLOAT_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MULT_FLOAT_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MULT_REAL_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MULT_REAL_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_MAG_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_MAG_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DIVIDE_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DIVIDE_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DOTPROD_FC_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DOTPROD_FC_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_DOTPROD_F_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_DOTPROD_F_FUNCTION")
-  ENDIF()
-  IF(${HAVE_VOLK_ATAN_FUNCTION})
-    SET(VOLK_DEFINITIONS "${VOLK_DEFINITIONS}; HAVE_VOLK_ATAN_FUNCTION")
-  ENDIF()
-ENDIF(VOLK_FOUND)
diff --git a/srslte/CMakeLists.txt b/srslte/CMakeLists.txt
index 0ecf9ddf6..82daf4df2 100644
--- a/srslte/CMakeLists.txt
+++ b/srslte/CMakeLists.txt
@@ -74,25 +74,6 @@ else(BLADERF_FOUND OR UHD_FOUND)
   add_definitions(-DDISABLE_RF)
 endif(BLADERF_FOUND OR UHD_FOUND)
 
-include(CheckFunctionExistsMath)
-if(${DISABLE_VOLK})
-  if(${DISABLE_VOLK} EQUAL 0)
-    find_package(Volk)
-  else(${DISABLE_VOLK} EQUAL 0)
-    message(STATUS "VOLK library disabled (DISABLE_VOLK=1)")
-  endif(${DISABLE_VOLK} EQUAL 0)
-else(${DISABLE_VOLK})
-  find_package(Volk)
-endif(${DISABLE_VOLK})
-
-if(VOLK_FOUND)
-  include_directories(${VOLK_INCLUDE_DIRS})
-  link_directories(${VOLK_LIBRARY_DIRS})
-  message(STATUS "   Compiling with VOLK SIMD library.")
-else(VOLK_FOUND)
-  message(STATUS "   VOLK SIMD library NOT found. Using generic implementation.")
-endif(VOLK_FOUND)
-
 ########################################################################
 # Add subdirectories
 ########################################################################
diff --git a/srslte/include/srslte/utils/vector_simd.h b/srslte/include/srslte/utils/vector_simd.h
index cd6eb4d28..cfdef5ecd 100644
--- a/srslte/include/srslte/utils/vector_simd.h
+++ b/srslte/include/srslte/utils/vector_simd.h
@@ -49,6 +49,26 @@ SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y,
 
 SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len); 
 
+SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len);
+
+SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
+
+SRSLTE_API void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len); 
+
+SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
+
+SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len); 
+
+SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len); 
+
+SRSLTE_API float srslte_vec_acc_ff_simd(float *x, uint32_t len); 
+
+SRSLTE_API cf_t srslte_vec_dot_prod_cfc_simd(cf_t *x, float *y, uint32_t len); 
+
+SRSLTE_API void srslte_vec_convert_if_simd(int16_t *x, float *z, float scale, uint32_t len); 
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/srslte/lib/CMakeLists.txt b/srslte/lib/CMakeLists.txt
index 3a73d8761..1e7b11df6 100644
--- a/srslte/lib/CMakeLists.txt
+++ b/srslte/lib/CMakeLists.txt
@@ -90,13 +90,6 @@ if(RF_FOUND)
   endif(BLADERF_FOUND)
 endif(RF_FOUND)
 
-if(VOLK_FOUND)
-  target_link_libraries(srslte ${VOLK_LIBRARIES})
-  if(NOT DisableMEX)
-    target_link_libraries(srslte_static ${VOLK_LIBRARIES})
-  endif(NOT DisableMEX)
-endif(VOLK_FOUND)
-
 INSTALL(TARGETS srslte DESTINATION ${LIBRARY_DIR})
 SRSLTE_SET_PIC(srslte)
 
diff --git a/srslte/lib/utils/vector.c b/srslte/lib/utils/vector.c
index 469320177..578a644c2 100644
--- a/srslte/lib/utils/vector.c
+++ b/srslte/lib/utils/vector.c
@@ -35,10 +35,6 @@
 #include "srslte/utils/vector_simd.h"
 #include "srslte/utils/bit.h"
 
-#ifdef HAVE_VOLK
-#include "volk/volk.h"
-#endif
-
 int srslte_vec_acc_ii(int *x, uint32_t len) {
   int i;
   int z=0;
@@ -48,19 +44,14 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
   return z;
 }
 
+// Used in PRACH detector
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_ACC_FUNCTION
-  float result;
-  volk_32f_accumulator_s32f(&result,x,len);
-  return result;
-#else
   int i;
   float z=0;
   for (i=0;i<len;i++) {
     z+=x[i];
   }
   return z;
-#endif
 }
 
 void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
@@ -79,27 +70,19 @@ cf_t srslte_vec_acc_cc(cf_t *x, uint32_t len) {
 }
 
 void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints) {
-#ifndef HAVE_VOLK_SQUARE_DIST_FUNCTION
   uint32_t i;
   cf_t diff; 
   for (i=0;i<npoints;i++) {
     diff = symbol - points[i];
     distance[i] = crealf(diff) * crealf(diff) + cimagf(diff) * cimagf(diff);
   }
-#else
-  volk_32fc_x2_square_dist_32f(distance,&symbol,points,npoints);
-#endif 
 }
 
 void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
-#ifndef HAVE_VOLK_SUB_FLOAT_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]-y[i];
   }
-#else
-  volk_32f_x2_subtract_32f(z,x,y,len);
-#endif 
 }
 
 void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
@@ -117,14 +100,15 @@ void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
   return srslte_vec_sub_fff((float*) x,(float*) y,(float*) z, 2*len);
 }
 
+// Used in PSS/SSS and sum_ccc
 void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
-#ifndef HAVE_VOLK_ADD_FLOAT_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]+y[i];
   }
 #else
-  volk_32f_x2_add_32f(z,x,y,len);
+  srslte_vec_sum_fff_simd(x, y, z, len);
 #endif
 }
 
@@ -179,14 +163,10 @@ void srslte_vec_sc_add_sss(int16_t *x, int16_t h, int16_t *z, uint32_t len) {
 }
 
 void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT_FLOAT_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
   }
-#else
-  volk_32f_s32f_multiply_32f(z,x,h,len);
-#endif
 }
 
 void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
@@ -219,8 +199,9 @@ void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
   srslte_vec_sc_prod_cfc(x, amplitude/max, y, len);
 }
 
+// Used throughout 
 void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
@@ -229,42 +210,36 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
   cf_t hh;
   __real__ hh = h;
   __imag__ hh = 0;
-  volk_32fc_s32fc_multiply_32fc(z,x,hh,len);
+  srslte_vec_sc_prod_ccc_simd(x,hh,z,len);
 #endif
 }
 
+// Chest UL 
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
   }
 #else
-  volk_32fc_s32fc_multiply_32fc(z,x,h,len);
+  srslte_vec_sc_prod_ccc_simd(x,h,z,len);
 #endif
 }
 
+// Used in turbo decoder 
 void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len) {
-#ifndef HAVE_VOLK_CONVERT_IF_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = ((float) x[i])/scale;
   }
-#else
-  volk_16i_s32f_convert_32f(z,x,scale,len);
-#endif  
 }
 
 
 void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_CONVERT_CI_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = ((int16_t) x[i]);
   }
-#else
-  volk_8i_convert_16i(z,x,len);
-#endif  
 }
 
 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
@@ -295,37 +270,25 @@ void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
 }
 
 void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
- #ifdef HAVE_VOLK_INTERLEAVE_FUNCTION
-  volk_32f_x2_interleave_32fc(x, real, imag, len);
-#else 
   int i;
   for (i=0;i<len;i++) {
     x[i] = real[i] + _Complex_I*imag[i];
   }
-#endif 
 }
 
 void srslte_vec_deinterleave_cf(cf_t *x, float *real, float *imag, uint32_t len) {
- #ifdef HAVE_VOLK_DEINTERLEAVE_FUNCTION
-  volk_32fc_deinterleave_32f_x2(real, imag, x, len);
-#else 
   int i;
   for (i=0;i<len;i++) {
     real[i] = __real__ x[i];
     imag[i] = __imag__ x[i];
   }
-#endif 
 }
 
 void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
-#ifdef HAVE_VOLK_DEINTERLEAVE_REAL_FUNCTION
-  volk_32fc_deinterleave_real_32f(real, x, len);
-#else 
   int i;
   for (i=0;i<len;i++) {
     real[i] = __real__ x[i];
   }
-#endif  
 }
 
 /* Note: We align memory to 32 bytes (for AVX compatibility) 
@@ -335,7 +298,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
-  if (posix_memalign(&ptr,32,size)) {
+  if (posix_memalign(&ptr,64,size)) {
     return NULL;
   } else {
     return ptr;
@@ -343,11 +306,11 @@ void *srslte_vec_malloc(uint32_t size) {
 }
 
 void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
-#ifndef HAVE_VOLK
+#ifndef LV_HAVE_SSE
   return realloc(ptr, new_size);
 #else
   void *new_ptr;
-  if (posix_memalign(&new_ptr,volk_get_alignment(),new_size)) {
+  if (posix_memalign(&new_ptr,64,new_size)) {
     return NULL;
   } else {
     memcpy(new_ptr, ptr, old_size);
@@ -468,40 +431,31 @@ void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) {
   }  
 }
 
-
+// Used in PSS
 void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
-#ifndef HAVE_VOLK_CONJ_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     y[i] = conjf(x[i]);
   }
-#else
-  volk_32fc_conjugate_32fc(y,x,len);
-#endif
 }
 
+// Used in scrambling complex 
 void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT_REAL_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*y[i];
   }
-#else
-  volk_32fc_32f_multiply_32fc(z,x,y,len);
-#endif
 }
 
+// Used in scrambling float
 void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT_REAL2_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*y[i];
   }
-#else
-  volk_32f_x2_multiply_32f(z,x,y,len);
-#endif
 }
 
+// Scrambling Short
 void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
 #ifndef LV_HAVE_SSE
   int i;
@@ -513,26 +467,27 @@ void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
 #endif
 }
 
+// CFO and OFDM processing
 void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT2_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*y[i];
   }
 #else
-  volk_32fc_x2_multiply_32fc(z,x,y,len);
+  srslte_vec_prod_ccc_simd(x,y,z,len);
 #endif
 }
 
-
+// PRACH, CHEST UL, etc. 
 void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
-#ifndef HAVE_VOLK_MULT2_CONJ_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*conjf(y[i]);
   }
 #else
-  volk_32fc_x2_multiply_conjugate_32fc(z,x,y,len);
+  srslte_vec_prod_conj_ccc_simd(x,y,z,len);
 #endif
 }
 
@@ -568,75 +523,48 @@ void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag
 }
 
 void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
-#ifdef HAVE_VOLK_DIVIDE_FUNCTION
-  volk_32f_x2_divide_32f(z, x, y, len);
-#else
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i] / y[i];
   }
-#endif
 }
 
 cf_t srslte_vec_dot_prod_ccc(cf_t *x, cf_t *y, uint32_t len) {
-#ifdef HAVE_VOLK_DOTPROD_FC_FUNCTION
-  cf_t res;
-  volk_32fc_x2_dot_prod_32fc(&res, x, y, len);
-  return res; 
-#else 
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*y[i];
   }
   return res;
-#endif
 }
 
+// Convolution filter 
 cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len) {
-#ifdef HAVE_VOLK_DOTPROD_CFC_FUNCTION
-  cf_t res;
-  volk_32fc_32f_dot_prod_32fc(&res, x, y, len);
-  return res; 
-#else  
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*y[i];
   }
   return res;
-#endif
 }
 
 cf_t srslte_vec_dot_prod_conj_ccc(cf_t *x, cf_t *y, uint32_t len) {
-#ifdef HAVE_VOLK_DOTPROD_CONJ_FC_FUNCTION
-  cf_t res;
-  volk_32fc_x2_conjugate_dot_prod_32fc(&res, x, y, len);
-  return res; 
-#else 
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*conjf(y[i]);
   }
   return res;
-#endif
 }
 
-
+// PHICH 
 float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len) {
-#ifdef HAVE_VOLK_DOTPROD_F_FUNCTION
-  float res;
-  volk_32f_x2_dot_prod_32f(&res, x, y, len);
-  return res; 
-#else 
   uint32_t i;
   float res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*y[i];
   }
   return res;
-#endif  
 }
 
 int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) {
@@ -656,48 +584,35 @@ float srslte_vec_avg_power_cf(cf_t *x, uint32_t len) {
   return crealf(srslte_vec_dot_prod_conj_ccc(x,x,len)) / len;
 }
 
+// PSS
 void srslte_vec_abs_cf(cf_t *x, float *abs, uint32_t len) {
-#ifndef HAVE_VOLK_MAG_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     abs[i] = cabsf(x[i]);
   }
-#else
-  volk_32fc_magnitude_32f(abs,x,len);
-#endif
 }
+
+// PRACH 
 void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t len) {
-#ifndef HAVE_VOLK_MAG_SQUARE_FUNCTION
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     abs_square[i] = crealf(x[i])*crealf(x[i])+cimagf(x[i])*cimagf(x[i]);
   }
 #else
-  volk_32fc_magnitude_squared_32f(abs_square,x,len);
+  srslte_vec_abs_square_cf_simd(x,abs_square,len);
 #endif
 }
 
 
 void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
-#ifndef HAVE_VOLK_ATAN_FUNCTION
   int i;
   for (i=0;i<len;i++) {
     arg[i] = cargf(x[i]);
   }
-#else
-  volk_32fc_s32f_atan2_32f(arg,x,1,len);
-
-#endif
-
 }
 
 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_FUNCTION
-  uint32_t target=0;
-  volk_32f_index_max_16u(&target,x,len);
-  return target;
-
-#else
   uint32_t i;
   float m=-FLT_MAX;
   uint32_t p=0;
@@ -708,16 +623,9 @@ uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
     }
   }
   return p;
-#endif
 }
 
 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_STAR_S_FUNCTION
-  int16_t target=0;
-  volk_16i_max_star_16i(&target,x,len);
-  return target;
-
-#else
   uint32_t i;
   int16_t m=-INT16_MIN;
   for (i=0;i<len;i++) {
@@ -726,7 +634,6 @@ int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
     }
   }
   return m;
-#endif
 }
 
 int16_t srslte_vec_max_abs_star_si(int16_t *x, uint32_t len) {
@@ -741,9 +648,6 @@ int16_t srslte_vec_max_abs_star_si(int16_t *x, uint32_t len) {
 }
 
 void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_VEC_FUNCTION
-  volk_32f_x2_max_32f(z,x,y,len);
-#else
   uint32_t i; 
   for (i=0;i<len;i++) {
     if (x[i] > y[i]) {
@@ -752,17 +656,11 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
       z[i] = y[i]; 
     }
   }
-#endif  
 }
 
 
+// CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION
-  uint32_t target=0;
-  volk_32fc_index_max_16u(&target,x,len);
-  return target;
-
-#else
   uint32_t i;
   float m=-FLT_MAX;
   uint32_t p=0;
@@ -775,7 +673,6 @@ uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
     }
   }
   return p;
-#endif
 }
 
 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {
diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index 1612f2c07..01a3d4c64 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -280,3 +280,173 @@ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
   }
 #endif
 }
+
+
+// for enb no-volk
+void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int points = len / 4;
+
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  float* zPtr = (float*) z;
+
+  __m128 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm_load_ps(xPtr);
+    yVal = _mm_load_ps(yPtr);
+
+    zVal = _mm_add_ps(xVal, yVal);
+
+    _mm_store_ps(zPtr, zVal); 
+
+    xPtr += 4;
+    yPtr += 4;
+    zPtr += 4;
+  }
+
+  number = points * 4;
+  for(;number < len; number++){
+    z[number] = x[number] + y[number];
+  }
+#endif
+}
+
+static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
+  __m128 yl, yh, tmp1, tmp2;
+  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+  tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+  x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+  tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+  return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+}
+
+void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) 
+{
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int halfPoints = len / 2;
+
+  __m128 xVal, yVal, zVal;
+  float* zPtr = (float*) z;
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+
+  for(; number < halfPoints; number++){
+    xVal = _mm_load_ps(xPtr); 
+    yVal = _mm_load_ps(yPtr); 
+    zVal = _mm_complexmul_ps(xVal, yVal);
+    _mm_store_ps(zPtr, zVal); 
+
+    xPtr += 4;
+    yPtr += 4;
+    zPtr += 4;
+  }
+
+  if((len % 2) != 0){
+    *zPtr = (*xPtr) * (*yPtr);
+  }
+#endif
+}
+
+static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
+  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+  y = _mm_xor_ps(y, conjugator); 
+  return _mm_complexmul_ps(x, y);
+}
+
+void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int halfPoints = len / 2;
+
+  __m128 xVal, yVal, zVal;
+  float* zPtr = (float*) z;
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+
+  for(; number < halfPoints; number++){
+    xVal = _mm_load_ps(xPtr); 
+    yVal = _mm_load_ps(yPtr); 
+    zVal = _mm_complexmulconj_ps(xVal, yVal);
+    _mm_store_ps(zPtr, zVal); 
+
+    xPtr += 4;
+    yPtr += 4;
+    zPtr += 4;
+  }
+
+  if((len % 2) != 0){
+    *zPtr = (*xPtr) * (*yPtr);
+  }
+#endif
+}
+
+void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int halfPoints = len / 2;
+
+  __m128 xVal, yl, yh, zVal, tmp1, tmp2;
+  float* zPtr = (float*) z;
+  const float* xPtr = (const float*) x;
+
+  // Set up constant scalar vector
+  yl = _mm_set_ps1(creal(h));
+  yh = _mm_set_ps1(cimag(h));
+
+  for(;number < halfPoints; number++){
+
+    xVal = _mm_load_ps(xPtr); 
+    tmp1 = _mm_mul_ps(xVal,yl); 
+    xVal = _mm_shuffle_ps(xVal,xVal,0xB1); 
+    tmp2 = _mm_mul_ps(xVal,yh); 
+    zVal = _mm_addsub_ps(tmp1,tmp2); 
+    _mm_storeu_ps(zPtr,zVal); 
+
+    xPtr += 4;
+    zPtr += 4;
+  }
+
+  if((len % 2) != 0) {
+    *zPtr = (*xPtr) * h;
+  }
+#endif
+}
+
+void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int quarterPoints = len / 4;
+
+  const float* xPtr = (const float*) x;
+  float* zPtr = z;
+
+  __m128 xVal1, xVal2, zVal;
+  for(; number < quarterPoints; number++){
+    xVal1 = _mm_load_ps(xPtr);
+    xPtr += 4;
+    xVal2 = _mm_load_ps(xPtr);
+    xPtr += 4;
+    xVal1 = _mm_mul_ps(xVal1, xVal1); 
+    xVal2 = _mm_mul_ps(xVal2, xVal2); 
+    zVal = _mm_hadd_ps(xVal1, xVal2);
+    _mm_store_ps(zPtr, zVal);
+    zPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < len; number++){
+    float val1Real = *xPtr++;
+    float val1Imag = *xPtr++;
+    *zPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+  }
+#endif
+}
+
+
+
+

From 40c161c2e6f74ed62765d7a6767f8d4a98295a8f Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 20 Jan 2017 11:50:16 +0100
Subject: [PATCH 02/55] ifdef for simd functions

---
 srslte/lib/utils/vector_simd.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index 01a3d4c64..c150209ca 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -314,6 +314,7 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
 #endif
 }
 
+#ifdef LV_HAVE_SSE
 static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
   __m128 yl, yh, tmp1, tmp2;
   yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
@@ -323,6 +324,7 @@ static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
   tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
   return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 }
+#endif
 
 void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) 
 {
@@ -352,11 +354,13 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
 #endif
 }
 
+#ifdef LV_HAVE_SSE
 static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
   const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
   y = _mm_xor_ps(y, conjugator); 
   return _mm_complexmul_ps(x, y);
 }
+#endif
 
 void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 #ifdef LV_HAVE_SSE

From 949d4b8df84a1f7895c03375c50ce6ed77eac155 Mon Sep 17 00:00:00 2001
From: Andre Puschmann <andre@softwareradiosystems.com>
Date: Wed, 25 Jan 2017 10:18:27 +0100
Subject: [PATCH 03/55] novolk: use unaligned load/store SSE intrinsics, allow
 debug builds

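---
Review notes (anything between this marker and the first diff is dropped by git am):

 * The return value of every _mm_shuffle_epi8() call added here is discarded,
   so xVal and lutVal are unchanged and _mm_extract_epi16(..., 0) yields
   element 0 on all eight iterations; elements 1..7 are never used.
 * Assigning the result would not be enough: the mask _mm_set1_epi8(j) copies
   byte j into every byte, whereas moving the j-th 16-bit element to position 0
   needs source bytes 2*j and 2*j+1, e.g.
       _mm_shuffle_epi8(xVal, _mm_set1_epi16((short) (((2*j+1) << 8) | (2*j))))
 * Only srslte_vec_sum_fff_simd() and srslte_vec_prod_ccc_simd() are switched
   to unaligned loads/stores in this patch; srslte_vec_prod_conj_ccc_simd()
   and srslte_vec_sc_prod_ccc_simd() still call _mm_load_ps() at this point
   in the series.
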
 srslte/lib/fec/rm_turbo.c      | 13 +++++++++----
 srslte/lib/utils/vector_simd.c | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/srslte/lib/fec/rm_turbo.c b/srslte/lib/fec/rm_turbo.c
index 5856a4e46..751c9fe0c 100644
--- a/srslte/lib/fec/rm_turbo.c
+++ b/srslte/lib/fec/rm_turbo.c
@@ -327,8 +327,11 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
       
         for (int j=0;j<8;j++) {
-          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   j); 
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
+          // For -O0 builds: shuffle j-th element to pos 0 and extract from there
+          _mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
+          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   0);
+          _mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
           output[l] += x;
         }
         xPtr ++;
@@ -346,8 +349,10 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
       
         for (int j=0;j<8;j++) {
-          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   j); 
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
+          _mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
+          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   0);
+          _mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
           output[l] += x;
         }
         xPtr++;
diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index c150209ca..8d91c5f42 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -227,8 +227,10 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
     lutVal = _mm_load_si128(lutPtr);
     
     for (int i=0;i<8;i++) {
-      int16_t x = (int16_t)   _mm_extract_epi16(xVal, i); 
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
+      _mm_shuffle_epi8(xVal,_mm_set1_epi8(i));
+      int16_t x = (int16_t)   _mm_extract_epi16(xVal, 0);
+      _mm_shuffle_epi8(lutVal,_mm_set1_epi8(i));
+      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
       y[l] = x;
     }
     xPtr ++;
@@ -295,12 +297,12 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
   __m128 xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_ps(xPtr);
-    yVal = _mm_load_ps(yPtr);
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
 
     zVal = _mm_add_ps(xVal, yVal);
 
-    _mm_store_ps(zPtr, zVal); 
+    _mm_storeu_ps(zPtr, zVal);
 
     xPtr += 4;
     yPtr += 4;
@@ -338,10 +340,10 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
   const float* yPtr = (const float*) y;
 
   for(; number < halfPoints; number++){
-    xVal = _mm_load_ps(xPtr); 
-    yVal = _mm_load_ps(yPtr); 
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
     zVal = _mm_complexmul_ps(xVal, yVal);
-    _mm_store_ps(zPtr, zVal); 
+    _mm_storeu_ps(zPtr, zVal);
 
     xPtr += 4;
     yPtr += 4;

From 78de0c718b9f3059d7e47c8c2ce9b661fcdded8e Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 25 Jan 2017 15:20:47 +0100
Subject: [PATCH 04/55] fixed alignment problem in vec_abs_square_cf_simd

---
 srslte/lib/utils/vector_simd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index c150209ca..25c1d2b7d 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -431,14 +431,14 @@ void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
 
   __m128 xVal1, xVal2, zVal;
   for(; number < quarterPoints; number++){
-    xVal1 = _mm_load_ps(xPtr);
+    xVal1 = _mm_loadu_ps(xPtr);
     xPtr += 4;
-    xVal2 = _mm_load_ps(xPtr);
+    xVal2 = _mm_loadu_ps(xPtr);
     xPtr += 4;
     xVal1 = _mm_mul_ps(xVal1, xVal1); 
     xVal2 = _mm_mul_ps(xVal2, xVal2); 
     zVal = _mm_hadd_ps(xVal1, xVal2);
-    _mm_store_ps(zPtr, zVal);
+    _mm_storeu_ps(zPtr, zVal);
     zPtr += 4;
   }
 

From f2a35c6dd1ec58e3db91029b50e7252ec441e6d2 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 25 Jan 2017 17:30:16 +0100
Subject: [PATCH 05/55] fixed tests with new simd functions

---
 CMakeLists.txt                            |   2 -
 srslte/include/srslte/utils/vector_simd.h |   2 +
 srslte/lib/utils/vector.c                 |  11 +--
 srslte/lib/utils/vector_simd.c            | 109 ++++++++++++++--------
 4 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 206139295..ae8e9c532 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,9 +115,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
     if (HAVE_AVX2)
       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx2 -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
     else (HAVE_AVX2)
-      message("NOT HAVE AVX2")
       if(HAVE_AVX)
-        message("HAVE AVX")
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
       elseif(HAVE_SSE)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -msse4.1 -Ofast -funroll-loops -DLV_HAVE_SSE")
diff --git a/srslte/include/srslte/utils/vector_simd.h b/srslte/include/srslte/utils/vector_simd.h
index cfdef5ecd..81aed443f 100644
--- a/srslte/include/srslte/utils/vector_simd.h
+++ b/srslte/include/srslte/utils/vector_simd.h
@@ -51,6 +51,8 @@ SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, ui
 
 SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
 
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len); 
+
 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len);
 
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
diff --git a/srslte/lib/utils/vector.c b/srslte/lib/utils/vector.c
index 578a644c2..52e8b54d0 100644
--- a/srslte/lib/utils/vector.c
+++ b/srslte/lib/utils/vector.c
@@ -200,20 +200,19 @@ void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
 }
 
 // Used throughout 
-void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
+void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) { 
+  #ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
   }
+  
 #else
-  cf_t hh;
-  __real__ hh = h;
-  __imag__ hh = 0;
-  srslte_vec_sc_prod_ccc_simd(x,hh,z,len);
+ srslte_vec_sc_prod_cfc_simd(x, h, z, len); 
 #endif
 }
 
+
 // Chest UL 
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
 #ifndef LV_HAVE_SSE
diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index c150209ca..09da6e36a 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -57,8 +57,8 @@ int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
+    xVal = _mm_loadu_si128(xPtr);
+    yVal = _mm_loadu_si128(yPtr);
 
     zVal = _mm_mullo_epi16(xVal, yVal);
 
@@ -69,7 +69,7 @@ int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
   }
   
   short dotProdVector[8];
-  _mm_store_si128((__m128i*) dotProdVector, dotProdVal);
+  _mm_storeu_si128((__m128i*) dotProdVector, dotProdVal);
   for (int i=0;i<8;i++) {
     result += dotProdVector[i]; 
   }
@@ -96,12 +96,12 @@ void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
+    xVal = _mm_loadu_si128(xPtr);
+    yVal = _mm_loadu_si128(yPtr);
 
     zVal = _mm_add_epi16(xVal, yVal);
 
-    _mm_store_si128(zPtr, zVal); 
+    _mm_storeu_si128(zPtr, zVal); 
 
     xPtr ++;
     yPtr ++;
@@ -129,12 +129,12 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
+    xVal = _mm_loadu_si128(xPtr);
+    yVal = _mm_loadu_si128(yPtr);
 
     zVal = _mm_sub_epi16(xVal, yVal);
 
-    _mm_store_si128(zPtr, zVal); 
+    _mm_storeu_si128(zPtr, zVal); 
 
     xPtr ++;
     yPtr ++;
@@ -161,12 +161,12 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
+    xVal = _mm_loadu_si128(xPtr);
+    yVal = _mm_loadu_si128(yPtr);
 
     zVal = _mm_mullo_epi16(xVal, yVal);
 
-    _mm_store_si128(zPtr, zVal); 
+    _mm_storeu_si128(zPtr, zVal); 
 
     xPtr ++;
     yPtr ++;
@@ -192,11 +192,11 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
   __m128i xVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_si128(xPtr);
+    xVal = _mm_loadu_si128(xPtr);
     
     zVal = _mm_srai_epi16(xVal, k);                 
       
-    _mm_store_si128(zPtr, zVal); 
+    _mm_storeu_si128(zPtr, zVal); 
 
     xPtr ++;
     zPtr ++;
@@ -223,8 +223,8 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
   __m128i xVal, lutVal;
   for(;number < points; number++){
 
-    xVal   = _mm_load_si128(xPtr);
-    lutVal = _mm_load_si128(lutPtr);
+    xVal   = _mm_loadu_si128(xPtr);
+    lutVal = _mm_loadu_si128(lutPtr);
     
     for (int i=0;i<8;i++) {
       int16_t x = (int16_t)   _mm_extract_epi16(xVal, i); 
@@ -295,12 +295,12 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
   __m128 xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_ps(xPtr);
-    yVal = _mm_load_ps(yPtr);
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
 
     zVal = _mm_add_ps(xVal, yVal);
 
-    _mm_store_ps(zPtr, zVal); 
+    _mm_storeu_ps(zPtr, zVal); 
 
     xPtr += 4;
     yPtr += 4;
@@ -338,18 +338,19 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
   const float* yPtr = (const float*) y;
 
   for(; number < halfPoints; number++){
-    xVal = _mm_load_ps(xPtr); 
-    yVal = _mm_load_ps(yPtr); 
+    xVal = _mm_loadu_ps(xPtr); 
+    yVal = _mm_loadu_ps(yPtr); 
     zVal = _mm_complexmul_ps(xVal, yVal);
-    _mm_store_ps(zPtr, zVal); 
+    _mm_storeu_ps(zPtr, zVal); 
 
     xPtr += 4;
     yPtr += 4;
     zPtr += 4;
   }
 
-  if((len % 2) != 0){
-    *zPtr = (*xPtr) * (*yPtr);
+  number = halfPoints * 2;
+  for(;number < len; number++){
+    z[number] = x[number] * y[number];
   }
 #endif
 }
@@ -373,18 +374,19 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
   const float* yPtr = (const float*) y;
 
   for(; number < halfPoints; number++){
-    xVal = _mm_load_ps(xPtr); 
-    yVal = _mm_load_ps(yPtr); 
+    xVal = _mm_loadu_ps(xPtr); 
+    yVal = _mm_loadu_ps(yPtr); 
     zVal = _mm_complexmulconj_ps(xVal, yVal);
-    _mm_store_ps(zPtr, zVal); 
+    _mm_storeu_ps(zPtr, zVal); 
 
     xPtr += 4;
     yPtr += 4;
     zPtr += 4;
   }
 
-  if((len % 2) != 0){
-    *zPtr = (*xPtr) * (*yPtr);
+  number = halfPoints * 2;
+  for(;number < len; number++){
+    z[number] = x[number] * conjf(y[number]);
   }
 #endif
 }
@@ -404,7 +406,7 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
 
   for(;number < halfPoints; number++){
 
-    xVal = _mm_load_ps(xPtr); 
+    xVal = _mm_loadu_ps(xPtr); 
     tmp1 = _mm_mul_ps(xVal,yl); 
     xVal = _mm_shuffle_ps(xVal,xVal,0xB1); 
     tmp2 = _mm_mul_ps(xVal,yh); 
@@ -415,12 +417,43 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
     zPtr += 4;
   }
 
-  if((len % 2) != 0) {
-    *zPtr = (*xPtr) * h;
+  number = halfPoints * 2;
+  for(;number < len; number++){
+    z[number] = x[number] * h;
   }
 #endif
 }
 
+// z[i] = x[i] * h for a real scalar h; the body is compiled only when LV_HAVE_SSE is defined.
+void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int halfPoints = len / 2;
+
+  __m128 xVal, hVal, zVal;
+  float* zPtr = (float*) z;
+  const float* xPtr = (const float*) x;
+
+  // Set up constant scalar vector
+  hVal = _mm_set_ps1(h);
+  
+  for(;number < halfPoints; number++){
+    // Each iteration handles 4 floats, i.e. two complex samples viewed as (re, im) pairs
+    xVal = _mm_loadu_ps(xPtr); 
+    zVal = _mm_mul_ps(xVal,hVal); 
+    _mm_storeu_ps(zPtr,zVal); 
+
+    xPtr += 4;
+    zPtr += 4;
+  }
+  // Scalar tail: the one remaining sample when len is odd
+  number = halfPoints * 2;
+  for(;number < len; number++){
+    z[number] = x[number] * h;
+  }
+
+#endif
+}
 void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
   unsigned int number = 0;
@@ -431,22 +464,20 @@ void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
 
   __m128 xVal1, xVal2, zVal;
   for(; number < quarterPoints; number++){
-    xVal1 = _mm_load_ps(xPtr);
+    xVal1 = _mm_loadu_ps(xPtr);
     xPtr += 4;
-    xVal2 = _mm_load_ps(xPtr);
+    xVal2 = _mm_loadu_ps(xPtr);
     xPtr += 4;
     xVal1 = _mm_mul_ps(xVal1, xVal1); 
     xVal2 = _mm_mul_ps(xVal2, xVal2); 
     zVal = _mm_hadd_ps(xVal1, xVal2);
-    _mm_store_ps(zPtr, zVal);
+    _mm_storeu_ps(zPtr, zVal);
     zPtr += 4;
   }
 
   number = quarterPoints * 4;
-  for(; number < len; number++){
-    float val1Real = *xPtr++;
-    float val1Imag = *xPtr++;
-    *zPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+  for(;number < len; number++){
+    z[number] = creal(x[number]) * creal(x[number]) + cimag(x[number])*cimag(x[number]);
   }
 #endif
 }

From e6e5b0468e0ef5976ebb85b8689ed2f71bafbee3 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 25 Jan 2017 17:38:22 +0100
Subject: [PATCH 06/55] restored rm_turbo (test not passing)

---
 srslte/lib/fec/rm_turbo.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/srslte/lib/fec/rm_turbo.c b/srslte/lib/fec/rm_turbo.c
index 751c9fe0c..31cbd82ca 100644
--- a/srslte/lib/fec/rm_turbo.c
+++ b/srslte/lib/fec/rm_turbo.c
@@ -327,11 +327,8 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
       
         for (int j=0;j<8;j++) {
-          // For -O0 builds: shuffle j-th element to pos 0 and extract from there
-          _mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
-          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   0);
-          _mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
+          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   j); 
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
           output[l] += x;
         }
         xPtr ++;
@@ -349,10 +346,8 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
       
         for (int j=0;j<8;j++) {
-          _mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
-          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   0);
-          _mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
+          int16_t x  = (int16_t)  _mm_extract_epi16(xVal,   j); 
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
           output[l] += x;
         }
         xPtr++;
@@ -718,4 +713,3 @@ int srslte_rm_turbo_rx(float *w_buff, uint32_t w_buff_len, float *input, uint32_
 
   return 0;
 }
-

From 2758ba4118bc9764552a5ea59c8eebe49d3f9c6a Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 25 Jan 2017 17:40:28 +0100
Subject: [PATCH 07/55] fixed lut in vector simd (now all tests passing)

---
 srslte/lib/utils/vector_simd.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index ba30e190f..16bbbc0bb 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -227,10 +227,8 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
     lutVal = _mm_loadu_si128(lutPtr);
     
     for (int i=0;i<8;i++) {
-      _mm_shuffle_epi8(xVal,_mm_set1_epi8(i));
-      int16_t x = (int16_t)   _mm_extract_epi16(xVal, 0);
-      _mm_shuffle_epi8(lutVal,_mm_set1_epi8(i));
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
+      int16_t x = (int16_t)   _mm_extract_epi16(xVal, i); 
+      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
       y[l] = x;
     }
     xPtr ++;

From 9acb1002e96c6e5ed655bc885749030b75463132 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 26 Jan 2017 17:48:32 +0100
Subject: [PATCH 08/55] added more functions to simd for UE

---
 srslte/include/srslte/utils/vector_simd.h |  10 +-
 srslte/lib/utils/vector.c                 |  33 ++++-
 srslte/lib/utils/vector_simd.c            | 164 +++++++++++++++++++++-
 3 files changed, 193 insertions(+), 14 deletions(-)

diff --git a/srslte/include/srslte/utils/vector_simd.h b/srslte/include/srslte/utils/vector_simd.h
index 81aed443f..8380a75de 100644
--- a/srslte/include/srslte/utils/vector_simd.h
+++ b/srslte/include/srslte/utils/vector_simd.h
@@ -53,11 +53,15 @@ SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
 
 SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len); 
 
+SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, uint32_t len); 
+
 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len);
 
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
 
-SRSLTE_API void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len); 
+SRSLTE_API void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *h, float *z, uint32_t len); 
 
 SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
 
@@ -65,6 +69,10 @@ SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t
 
 SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len); 
 
+SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
+
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len); 
+
 SRSLTE_API float srslte_vec_acc_ff_simd(float *x, uint32_t len); 
 
 SRSLTE_API cf_t srslte_vec_dot_prod_cfc_simd(cf_t *x, float *y, uint32_t len); 
diff --git a/srslte/lib/utils/vector.c b/srslte/lib/utils/vector.c
index 52e8b54d0..10fe38165 100644
--- a/srslte/lib/utils/vector.c
+++ b/srslte/lib/utils/vector.c
@@ -44,7 +44,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
   return z;
 }
 
-// Used in PRACH detector
+// Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
   int i;
   float z=0;
@@ -79,10 +79,14 @@ void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t
 }
 
 void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]-y[i];
   }
+#else
+  srslte_vec_sub_fff_simd(x, y, z, len);
+#endif
 }
 
 void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
@@ -96,6 +100,7 @@ void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
 #endif
 }
 
+// Noise estimation in chest_dl, interpolation 
 void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
   return srslte_vec_sub_fff((float*) x,(float*) y,(float*) z, 2*len);
 }
@@ -161,12 +166,16 @@ void srslte_vec_sc_add_sss(int16_t *x, int16_t h, int16_t *z, uint32_t len) {
     z[i] += h;
   }
 }
-
+// PSS, PBCH, DEMOD, FFTW, etc. 
 void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
+#ifndef LV_HAVE_SSE
   int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
   }
+#else
+  srslte_vec_sc_prod_fff_simd(x, h, z, len);
+#endif
 }
 
 void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
@@ -490,8 +499,9 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 #endif
 }
 
-#define DIV_USE_VEC
+//#define DIV_USE_VEC
 
+// Used in SSS 
 /* Complex division is conjugate multiplication + real division */
 void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
 #ifdef DIV_USE_VEC
@@ -528,16 +538,21 @@ void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
   }
 }
 
+// PSS. convolution 
 cf_t srslte_vec_dot_prod_ccc(cf_t *x, cf_t *y, uint32_t len) {
+#ifndef LV_HAVE_SSE
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*y[i];
   }
   return res;
+#else
+  return srslte_vec_dot_prod_ccc_simd(x, y, len); 
+#endif
 }
 
-// Convolution filter 
+// Convolution filter and in SSS search 
 cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len) {
   uint32_t i;
   cf_t res = 0;
@@ -547,13 +562,19 @@ cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len) {
   return res;
 }
 
+// SYNC: conjugate dot product, sum over i of x[i]*conj(y[i])
 cf_t srslte_vec_dot_prod_conj_ccc(cf_t *x, cf_t *y, uint32_t len) {
+#ifndef LV_HAVE_SSE
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
     res += x[i]*conjf(y[i]);
   }
   return res;
+#else
+  return srslte_vec_dot_prod_conj_ccc_simd(x, y, len); 
+#endif
+  
 }
 
 // PHICH 
@@ -583,7 +604,7 @@ float srslte_vec_avg_power_cf(cf_t *x, uint32_t len) {
   return crealf(srslte_vec_dot_prod_conj_ccc(x,x,len)) / len;
 }
 
-// PSS
+// PSS (disabled and using abs_square )
 void srslte_vec_abs_cf(cf_t *x, float *abs, uint32_t len) {
   int i;
   for (i=0;i<len;i++) {
diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index 16bbbc0bb..763c3285d 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -314,6 +314,38 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
 #endif
 }
 
+void srslte_vec_sub_fff_simd(float *x, float *y, float *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int points = len / 4;
+  // z[i] = x[i] - y[i]; uses unaligned loads/stores, 4 floats per SSE iteration
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  float* zPtr = (float*) z;
+
+  __m128 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
+
+    zVal = _mm_sub_ps(xVal, yVal);
+
+    _mm_storeu_ps(zPtr, zVal);
+
+    xPtr += 4;
+    yPtr += 4;
+    zPtr += 4;
+  }
+  // Scalar tail for the remaining len % 4 elements; must subtract like the vector loop
+  number = points * 4;
+  for(;number < len; number++){
+    z[number] = x[number] - y[number];
+  }
+#endif
+}
+
+
 #ifdef LV_HAVE_SSE
 static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
   __m128 yl, yh, tmp1, tmp2;
@@ -326,6 +358,97 @@ static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
 }
 #endif
 
+// Negates lanes 1 and 3 of y (the imaginary parts of two interleaved complex floats), then calls _mm_complexmul_ps
+#ifdef LV_HAVE_SSE
+static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
+  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+  y = _mm_xor_ps(y, conjugator); 
+  return _mm_complexmul_ps(x, y);
+}
+#endif
+// Returns the sum of x[i]*y[i] (no conjugation); the result stays 0 when LV_HAVE_SSE is not defined.
+cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
+{
+  cf_t result = 0; 
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int points = len / 2;
+
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  __m128 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
+
+    zVal = _mm_complexmul_ps(xVal, yVal);
+
+    dotProdVal = _mm_add_ps(dotProdVal, zVal);
+
+    xPtr += 4;
+    yPtr += 4;
+  }
+  // Fold the two complex partial sums held in the accumulator register
+  cf_t dotProdVector[2];
+  _mm_storeu_ps((float*) dotProdVector, dotProdVal);
+  for (int i=0;i<2;i++) {
+    result += dotProdVector[i]; 
+  }
+  // Scalar tail: the one remaining sample when len is odd
+  number = points * 2;
+  for(;number < len; number++){
+    result += (x[number] * y[number]);
+  }
+  
+#endif
+  return result; 
+}
+
+// Returns the sum of x[i]*conj(y[i]); the result stays 0 when LV_HAVE_SSE is not defined.
+cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len)
+{
+  cf_t result = 0; 
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int points = len / 2;
+
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  __m128 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
+
+    zVal = _mm_complexmulconj_ps(xVal, yVal);
+
+    dotProdVal = _mm_add_ps(dotProdVal, zVal);
+
+    xPtr += 4;
+    yPtr += 4;
+  }
+  // Fold the two complex partial sums held in the accumulator register
+  cf_t dotProdVector[2];
+  _mm_storeu_ps((float*) dotProdVector, dotProdVal);
+  for (int i=0;i<2;i++) {
+    result += dotProdVector[i]; 
+  }
+  // Scalar tail for an odd len; must conjugate y exactly like the vector loop
+  number = points * 2;
+  for(;number < len; number++){
+    result += (x[number] * conjf(y[number]));
+  }
+  
+#endif
+  return result; 
+}
 void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) 
 {
 #ifdef LV_HAVE_SSE
@@ -355,13 +478,6 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
 #endif
 }
 
-#ifdef LV_HAVE_SSE
-static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
-  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-  y = _mm_xor_ps(y, conjugator); 
-  return _mm_complexmul_ps(x, y);
-}
-#endif
 
 void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
@@ -454,6 +570,40 @@ void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len) {
 
 #endif
 }
+
+
+// z[i] = x[i] * h for real vectors; the body is compiled only when LV_HAVE_SSE is defined.
+void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, uint32_t len) {
+#ifdef LV_HAVE_SSE
+  unsigned int number = 0;
+  const unsigned int quarterPoints = len / 4;
+
+  __m128 xVal, hVal, zVal;
+  float* zPtr = (float*) z;
+  const float* xPtr = (const float*) x;
+
+  // Set up constant scalar vector
+  hVal = _mm_set_ps1(h);
+  
+  for(;number < quarterPoints; number++){
+
+    xVal = _mm_loadu_ps(xPtr); 
+    zVal = _mm_mul_ps(xVal,hVal); 
+    _mm_storeu_ps(zPtr,zVal); 
+
+    xPtr += 4;
+    zPtr += 4;
+  }
+  // Scalar tail for the remaining len % 4 elements
+  number = quarterPoints * 4;
+  for(;number < len; number++){
+    z[number] = x[number] * h;
+  }
+
+#endif
+}
+
+
 void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
   unsigned int number = 0;

From 932ed9b24714b0375ecdb4ee50892168f6b40906 Mon Sep 17 00:00:00 2001
From: Paul Sutton <paul@softwareradiosystems.com>
Date: Fri, 27 Jan 2017 11:56:59 +0000
Subject: [PATCH 09/55] Added GCC_ARCH option to manually set -march in GCC

---
 CMakeLists.txt              | 15 ++++++++-------
 cmake/modules/FindSSE.cmake |  8 ++++----
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae8e9c532..d36de57aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ configure_file(
 option(DisableMEX     "DisableMEX"     ON)
 option(StaticMKL      "StaticMKL"      OFF)
 option(DisableBladeRF "DisableBladeRF" OFF)
+set(GCC_ARCH native CACHE STRING "GCC compile for specific architecture.")
 
 ########################################################################
 # Install Dirs
@@ -101,24 +102,24 @@ endif(CMAKE_COMPILER_IS_GNUCXX)
 if(CMAKE_COMPILER_IS_GNUCC)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g")
 
+  find_package(SSE)
+
   if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-    find_package(SSE)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0")
     if(HAVE_AVX)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx -DLV_HAVE_AVX -DLV_HAVE_SSE")      
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -mfpmath=sse -mavx -DLV_HAVE_AVX -DLV_HAVE_SSE")
     elseif(HAVE_SSE)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -msse4.1 -DLV_HAVE_SSE")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -mfpmath=sse -msse4.1 -DLV_HAVE_SSE")
     endif(HAVE_AVX)
   else(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3")
-    find_package(SSE)
     if (HAVE_AVX2)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx2 -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -mfpmath=sse -mavx2 -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
     else (HAVE_AVX2)
       if(HAVE_AVX)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -mavx -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -mfpmath=sse -mavx -Ofast -funroll-loops -DLV_HAVE_AVX -DLV_HAVE_SSE")
       elseif(HAVE_SSE)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mfpmath=sse -msse4.1 -Ofast -funroll-loops -DLV_HAVE_SSE")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${GCC_ARCH} -mfpmath=sse -msse4.1 -Ofast -funroll-loops -DLV_HAVE_SSE")
       endif(HAVE_AVX)
     endif (HAVE_AVX2)
     
diff --git a/cmake/modules/FindSSE.cmake b/cmake/modules/FindSSE.cmake
index 7b258f70f..9dbbeef3e 100644
--- a/cmake/modules/FindSSE.cmake
+++ b/cmake/modules/FindSSE.cmake
@@ -1,9 +1,9 @@
 
 include(CheckCSourceRuns)
 
-option(ENABLE_SSE "Enable compile-time SSE4.1 support." ON)
-option(ENABLE_AVX "Enable compile-time AVX support."  ON)
-option(ENABLE_AVX2 "Enable compile-time AVX2 support."  ON)
+option(ENABLE_SSE  "Enable compile-time SSE4.1 support." ON)
+option(ENABLE_AVX  "Enable compile-time AVX support."    ON)
+option(ENABLE_AVX2 "Enable compile-time AVX2 support."   ON)
 
 if (ENABLE_SSE)
     #
@@ -96,4 +96,4 @@ if (ENABLE_SSE)
 
 endif()
 
-mark_as_advanced(HAVE_SSE, HAVE_AVX, HAVE_AVX2)
\ No newline at end of file
+mark_as_advanced(HAVE_SSE HAVE_AVX HAVE_AVX2)

From 979a590dc92e2a83aac74eb9b7c270031528ff97 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 7 Apr 2017 14:54:24 +0200
Subject: [PATCH 10/55] comment references to uhd::register_handler

---
 srslte/lib/rf/uhd_c_api.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/srslte/lib/rf/uhd_c_api.cpp b/srslte/lib/rf/uhd_c_api.cpp
index 92af48f69..1b7772e68 100644
--- a/srslte/lib/rf/uhd_c_api.cpp
+++ b/srslte/lib/rf/uhd_c_api.cpp
@@ -9,6 +9,7 @@ extern "C" {
 #include "uhd_c_api.h"
 }
 
+/*
 #if UHD_VERSION < 31100
 static void (*handler)(const char*);
 
@@ -26,6 +27,7 @@ void rf_uhd_register_msg_handler_c(void (*new_handler)(const char*))
   uhd::msg::register_handler(translate_handler);
 #endif
 }
+*/
 
 void uhd_tx_metadata_set_time_spec(uhd_tx_metadata_handle *md, time_t secs, double frac_secs)
 {

From 8e440f512ab1ac9490aad73a5688882c31fc70e1 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 7 Apr 2017 15:06:51 +0200
Subject: [PATCH 11/55] comment references to uhd::register_handler

---
 srslte/lib/rf/uhd_c_api.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/srslte/lib/rf/uhd_c_api.cpp b/srslte/lib/rf/uhd_c_api.cpp
index 1b7772e68..93792722d 100644
--- a/srslte/lib/rf/uhd_c_api.cpp
+++ b/srslte/lib/rf/uhd_c_api.cpp
@@ -19,15 +19,17 @@ void translate_handler(uhd::msg::type_t type, const std::string & msg)
     handler(msg.c_str());
 }
 #endif
+*/
 
 void rf_uhd_register_msg_handler_c(void (*new_handler)(const char*)) 
 {
+/*
 #if UHD_VERSION < 31100
   handler = new_handler;
   uhd::msg::register_handler(translate_handler);
 #endif
-}
 */
+}
 
 void uhd_tx_metadata_set_time_spec(uhd_tx_metadata_handle *md, time_t secs, double frac_secs)
 {

From ec34d56e77082a66a01d45c825f7dce739c4d17e Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 8 Jun 2017 15:15:00 +0200
Subject: [PATCH 12/55] final merging changes

---
 CMakeLists.txt                            |  2 -
 lib/src/phy/rf/uhd_c_api.cpp              |  4 --
 lib/src/phy/utils/vector.c                | 20 ++++--
 lib/src/phy/utils/vector_simd.c           |  8 +--
 srslte/CMakeLists.txt                     | 84 ----------------------
 srslte/include/srslte/utils/vector_simd.h | 86 -----------------------
 6 files changed, 17 insertions(+), 187 deletions(-)
 delete mode 100644 srslte/CMakeLists.txt
 delete mode 100644 srslte/include/srslte/utils/vector_simd.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1cc2640e2..d7871d71d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -195,8 +195,6 @@ endmacro(ADD_CXX_COMPILER_FLAG_IF_AVAILABLE)
 if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${GCC_ARCH} -Wall -Wno-comment -Wno-reorder -Wno-unused-but-set-variable -Wno-unused-variable -std=c++03")
 
-  find_package(SSE)
-
   if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -DDEBUG_MODE")
   else(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/lib/src/phy/rf/uhd_c_api.cpp b/lib/src/phy/rf/uhd_c_api.cpp
index c0fb6bfd3..da348c17b 100644
--- a/lib/src/phy/rf/uhd_c_api.cpp
+++ b/lib/src/phy/rf/uhd_c_api.cpp
@@ -9,7 +9,6 @@ extern "C" {
 #include "uhd_c_api.h"
 }
 
-/*
 #if UHD_VERSION < 31100
 static void (*handler)(const char*);
 
@@ -19,16 +18,13 @@ void translate_handler(uhd::msg::type_t type, const std::string & msg)
     handler(msg.c_str());
 }
 #endif
-*/
 
 void rf_uhd_register_msg_handler_c(void (*new_handler)(const char*)) 
 {
-/*
 #if UHD_VERSION < 31100
   handler = new_handler;
   uhd::msg::register_handler(translate_handler);
 #endif
-*/
 }
 
 void uhd_tx_metadata_set_time_spec(uhd_tx_metadata_handle *md, time_t secs, double frac_secs)
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index 3146c760d..daa03d0b4 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -57,12 +57,18 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
 
 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-  int i;
-  float z=0;
-  for (i=0;i<len;i++) {
-    z+=x[i];
-  }
-  return z;
+#ifdef HAVE_VOLK_ACC_FUNCTION
+  float result;
+  volk_32f_accumulator_s32f(&result,x,len);
+  return result;
+#else
+   int i;
+   float z=0;
+   for (i=0;i<len;i++) {
+     z+=x[i];
+   }
+   return z;
+#endif
 }
 
 void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
@@ -336,7 +342,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
-  if (posix_memalign(&ptr,64,size)) {
+  if (posix_memalign(&ptr,256,size)) {
     return NULL;
   } else {
     return ptr;
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index a1c1ef96b..e3428f66c 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -72,7 +72,7 @@ int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len)
   }
   
   short dotProdVector[8];
-  _mm_storeu_si128((__m128i*) dotProdVector, dotProdVal);
+  _mm_store_si128((__m128i*) dotProdVector, dotProdVal);
   for (int i=0;i<8;i++) {
     result += dotProdVector[i]; 
   }
@@ -140,12 +140,12 @@ void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_loadu_si128(xPtr);
-    yVal = _mm_loadu_si128(yPtr);
+    xVal = _mm_load_si128(xPtr);
+    yVal = _mm_load_si128(yPtr);
 
     zVal = _mm_add_epi16(xVal, yVal);
 
-    _mm_storeu_si128(zPtr, zVal); 
+    _mm_store_si128(zPtr, zVal); 
 
     xPtr ++;
     yPtr ++;
diff --git a/srslte/CMakeLists.txt b/srslte/CMakeLists.txt
deleted file mode 100644
index 1e86e01c3..000000000
--- a/srslte/CMakeLists.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-#
-# Copyright 2013-2015 Software Radio Systems Limited
-#
-# This file is part of the srsLTE library.
-#
-# srsLTE is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of
-# the License, or (at your option) any later version.
-#
-# srsLTE is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# A copy of the GNU Affero General Public License can be found in
-# the LICENSE file in the top-level directory of this distribution
-# and at http://www.gnu.org/licenses/.
-#
-
-########################################################################
-# Install headers
-########################################################################
-INSTALL(DIRECTORY include/ 
-        DESTINATION "${INCLUDE_DIR}" 
-        FILES_MATCHING PATTERN "*.h"
-)
-
-########################################################################
-# Add headers to cmake project (useful for IDEs)
-########################################################################
-set(HEADERS_ALL "")
-file(GLOB headers *)
-FOREACH (_header ${headers})
-  if(IS_DIRECTORY ${_header})
-    file(GLOB_RECURSE tmp "${_header}/*.h")
-    list(APPEND HEADERS_ALL ${tmp})
-  endif(IS_DIRECTORY ${_header})
-ENDFOREACH()
-add_custom_target (add_srslte_headers SOURCES ${HEADERS_ALL})
-
-########################################################################
-# Find Dependencies
-########################################################################
-
-find_package(MKL)
-if(MKL_FOUND)
-  include_directories(${MKL_INCLUDE_DIRS})
-  link_directories(${MKL_LIBRARY_DIRS})
-else(MKL_FOUND)
-  find_package(FFTW3F REQUIRED)
-  if(FFTW3F_FOUND)
-    include_directories(${FFTW3F_INCLUDE_DIRS})
-    link_directories(${FFTW3F_LIBRARY_DIRS})
-  endif(FFTW3F_FOUND)
-endif(MKL_FOUND)
-
-find_package(UHD)
-if(UHD_FOUND)
-  include_directories(${UHD_INCLUDE_DIRS})
-  link_directories(${UHD_LIBRARY_DIRS})
-endif(UHD_FOUND)
-
-if(NOT DisableBladeRF) 
-  find_package(bladeRF)
-  if(BLADERF_FOUND)
-    include_directories(${BLADERF_INCLUDE_DIRS})
-    link_directories(${BLADERF_LIBRARY_DIRS})
-  endif(BLADERF_FOUND)
-endif(NOT DisableBladeRF)
-
-if(BLADERF_FOUND OR UHD_FOUND)
-  set(RF_FOUND TRUE CACHE INTERNAL "RF frontend found")
-else(BLADERF_FOUND OR UHD_FOUND)
-  set(RF_FOUND FALSE CACHE INTERNAL "RF frontend found")
-  add_definitions(-DDISABLE_RF)
-endif(BLADERF_FOUND OR UHD_FOUND)
-
-########################################################################
-# Add subdirectories
-########################################################################
-add_subdirectory(lib)
-add_subdirectory(include)
-add_subdirectory(examples)
diff --git a/srslte/include/srslte/utils/vector_simd.h b/srslte/include/srslte/utils/vector_simd.h
deleted file mode 100644
index 8380a75de..000000000
--- a/srslte/include/srslte/utils/vector_simd.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- *
- * \section COPYRIGHT
- *
- * Copyright 2013-2015 Software Radio Systems Limited
- *
- * \section LICENSE
- *
- * This file is part of the srsLTE library.
- *
- * srsLTE is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * srsLTE is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * A copy of the GNU Affero General Public License can be found in
- * the LICENSE file in the top-level directory of this distribution
- * and at http://www.gnu.org/licenses/.
- *
- */
-
-#ifndef VECTORSIMD_
-#define VECTORSIMD_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdio.h>
-#include <stdint.h>
-#include "srslte/config.h"
-
-SRSLTE_API int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len); 
-  
-SRSLTE_API void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_sc_div2_sss_simd(short *x, int n_rightshift, short *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len); 
-
-SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *abs_square, uint32_t len);
-
-SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *h, float *z, uint32_t len); 
-
-SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len); 
-
-SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, uint32_t len); 
-
-SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(cf_t *x, float h, cf_t *z, uint32_t len); 
-
-SRSLTE_API float srslte_vec_acc_ff_simd(float *x, uint32_t len); 
-
-SRSLTE_API cf_t srslte_vec_dot_prod_cfc_simd(cf_t *x, float *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_convert_if_simd(int16_t *x, float *z, float scale, uint32_t len); 
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif

From 2bba9d187d32ce18fc0e752f423c637b10ab0335 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 7 Jul 2017 16:36:27 +0200
Subject: [PATCH 13/55] fixed dotprodconj. Removed unaligned load/store

---
 lib/src/phy/utils/vector.c      |  2 +-
 lib/src/phy/utils/vector_simd.c | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index 61d0271b6..8ceefe3da 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -608,7 +608,7 @@ cf_t srslte_vec_dot_prod_conj_ccc(cf_t *x, cf_t *y, uint32_t len) {
   uint32_t i;
   cf_t res = 0;
   for (i=0;i<len;i++) {
-    res += x[i]*y[i];
+    res += x[i]*conjf(y[i]);
   }
   return res;
 #else
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index c7bf27bf5..abbd2f7ff 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -206,12 +206,12 @@ void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_loadu_si128(xPtr);
-    yVal = _mm_loadu_si128(yPtr);
+    xVal = _mm_load_si128(xPtr);
+    yVal = _mm_load_si128(yPtr);
 
     zVal = _mm_sub_epi16(xVal, yVal);
 
-    _mm_storeu_si128(zPtr, zVal); 
+    _mm_store_si128(zPtr, zVal);
 
     xPtr ++;
     yPtr ++;
@@ -273,12 +273,12 @@ void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len)
   __m128i xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_loadu_si128(xPtr);
-    yVal = _mm_loadu_si128(yPtr);
+    xVal = _mm_load_si128(xPtr);
+    yVal = _mm_load_si128(yPtr);
 
     zVal = _mm_mullo_epi16(xVal, yVal);
 
-    _mm_storeu_si128(zPtr, zVal); 
+    _mm_store_si128(zPtr, zVal);
 
     xPtr ++;
     yPtr ++;
@@ -341,11 +341,11 @@ void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
   __m128i xVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_loadu_si128(xPtr);
+    xVal = _mm_load_si128(xPtr);
     
     zVal = _mm_srai_epi16(xVal, k);                 
       
-    _mm_storeu_si128(zPtr, zVal); 
+    _mm_store_si128(zPtr, zVal);
 
     xPtr ++;
     zPtr ++;

From 0dae4a00c44f006eae5ffd06d4773e9136816d7f Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 7 Jul 2017 18:04:59 +0200
Subject: [PATCH 14/55] missing return statement

---
 lib/include/srslte/common/metrics_hub.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/include/srslte/common/metrics_hub.h b/lib/include/srslte/common/metrics_hub.h
index 8443ef65a..9575347a2 100644
--- a/lib/include/srslte/common/metrics_hub.h
+++ b/lib/include/srslte/common/metrics_hub.h
@@ -34,6 +34,7 @@ public:
   bool init(metrics_interface<metrics_t> *m_, float report_period_secs=1.0) {
     m = m_; 
     start_periodic(report_period_secs*1e6);
+    return true;
   }
   void stop() {
     thread_cancel();
@@ -47,7 +48,7 @@ private:
   void run_period() {
     metrics_t metric; 
     m->get_metrics(metric);
-    for (int i=0;i<listeners.size();i++) {
+    for (uint32_t i=0;i<listeners.size();i++) {
       listeners[i]->set_metrics(metric);
     }
   }

From f629e10fcfdf70cbc43b0e7c00a4eb18ac15f08b Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 7 Jul 2017 18:44:17 +0200
Subject: [PATCH 15/55] fixed bug in sub_sse() and added couple of avx
 functions

---
 lib/include/srslte/phy/utils/vector_simd.h |  4 ++
 lib/src/phy/utils/vector.c                 | 12 +++-
 lib/src/phy/utils/vector_simd.c            | 65 +++++++++++++++++++++-
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 6fe55f89d..1010cbed6 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -49,8 +49,12 @@ SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t l
 
 SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len);
 
+SRSLTE_API void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len);
+
 SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len);
 
+SRSLTE_API void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len);
+
 SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len);
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index 8ceefe3da..2a2001ae3 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -101,9 +101,13 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
   for (i=0;i<len;i++) {
     z[i] = x[i]-y[i];
   }
+#else
+#ifdef LV_HAVE_AVX
+  srslte_vec_sub_fff_avx(x, y, z, len);
 #else
   srslte_vec_sub_fff_sse(x, y, z, len);
 #endif
+#endif
 }
 
 void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
@@ -134,7 +138,11 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
     z[i] = x[i]+y[i];
   }
 #else
-  srslte_vec_sum_fff_sse(x, y, z, len);
+  #ifdef LV_HAVE_AVX
+    srslte_vec_sum_fff_avx(x, y, z, len);
+  #else
+    srslte_vec_sum_fff_sse(x, y, z, len);
+  #endif
 #endif
 }
 
@@ -246,7 +254,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
   for (i=0;i<len;i++) {
     z[i] = x[i]*h;
   }
-#endif  
+#endif
 }
 
 
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index abbd2f7ff..d38373d80 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -501,6 +501,36 @@ void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
 #endif
 }
 
+void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) { /* z[i] = x[i] + y[i] for i in [0, len) */
+#ifdef LV_HAVE_AVX /* without LV_HAVE_AVX the body is empty and z is left untouched */
+  unsigned int number = 0;
+  const unsigned int points = len / 8; /* number of full 8-float (256-bit) groups */
+
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  float* zPtr = (float*) z;
+
+  __m256 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm256_loadu_ps(xPtr); /* unaligned loads: no alignment requirement on x/y */
+    yVal = _mm256_loadu_ps(yPtr);
+
+    zVal = _mm256_add_ps(xVal, yVal);
+
+    _mm256_storeu_ps(zPtr, zVal); /* unaligned store: no alignment requirement on z */
+
+    xPtr += 8;
+    yPtr += 8;
+    zPtr += 8;
+  }
+
+  for(number = points * 8;number < len; number++){ /* scalar tail for the remaining len % 8 elements */
+    z[number] = x[number] + y[number];
+  }
+#endif
+}
+
 void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
 #ifdef LV_HAVE_SSE
   unsigned int number = 0;
@@ -525,14 +555,43 @@ void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
     zPtr += 4;
   }
 
-  number = points * 4;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
+  for(number = points * 4;number < len; number++){
+    z[number] = x[number] - y[number];
   }
 #endif
 }
 
 
+void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len) { /* z[i] = x[i] - y[i] for i in [0, len) */
+#ifdef LV_HAVE_AVX /* was LV_HAVE_SSE: the __m256/_mm256_* code below needs AVX, same guard as srslte_vec_sum_fff_avx */
+  unsigned int number = 0;
+  const unsigned int points = len / 8; /* number of full 8-float (256-bit) groups */
+
+  const float* xPtr = (const float*) x;
+  const float* yPtr = (const float*) y;
+  float* zPtr = (float*) z;
+
+  __m256 xVal, yVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm256_loadu_ps(xPtr); /* unaligned loads: no alignment requirement on x/y */
+    yVal = _mm256_loadu_ps(yPtr);
+
+    zVal = _mm256_sub_ps(xVal, yVal);
+
+    _mm256_storeu_ps(zPtr, zVal); /* unaligned store: no alignment requirement on z */
+
+    xPtr += 8;
+    yPtr += 8;
+    zPtr += 8;
+  }
+
+  for(number = points * 8;number < len; number++){ /* scalar tail for the remaining len % 8 elements */
+    z[number] = x[number] - y[number];
+  }
+#endif
+}
+
 #ifdef LV_HAVE_SSE
 static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
   __m128 yl, yh, tmp1, tmp2;

From 012d14f4b50441f823e59671ceab9e1e233d06b1 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Tue, 11 Jul 2017 13:17:26 +0200
Subject: [PATCH 16/55] fixed segfault due to race condition in scrambling
 sequence pre-generation

---
 lib/include/srslte/phy/phch/pdsch.h |  3 ++-
 lib/include/srslte/phy/phch/pucch.h |  3 ++-
 lib/include/srslte/phy/phch/pusch.h |  3 ++-
 lib/src/phy/phch/pdsch.c            | 24 +++++++++++----------
 lib/src/phy/phch/pucch.c            |  8 ++++---
 lib/src/phy/phch/pusch.c            | 33 +++++++++++++++--------------
 6 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/lib/include/srslte/phy/phch/pdsch.h b/lib/include/srslte/phy/phch/pdsch.h
index 7730d2fa1..ad01c4ef8 100644
--- a/lib/include/srslte/phy/phch/pdsch.h
+++ b/lib/include/srslte/phy/phch/pdsch.h
@@ -48,7 +48,8 @@
 #include "srslte/phy/phch/pdsch_cfg.h"
 
 typedef struct {
-  srslte_sequence_t seq[SRSLTE_NSUBFRAMES_X_FRAME];  
+  srslte_sequence_t seq[SRSLTE_NSUBFRAMES_X_FRAME];
+  bool sequence_generated;
 } srslte_pdsch_user_t;
 
 /* PDSCH object */
diff --git a/lib/include/srslte/phy/phch/pucch.h b/lib/include/srslte/phy/phch/pucch.h
index 3542dc53f..56d512418 100644
--- a/lib/include/srslte/phy/phch/pucch.h
+++ b/lib/include/srslte/phy/phch/pucch.h
@@ -80,7 +80,8 @@ typedef struct SRSLTE_API {
 } srslte_pucch_cfg_t;
 
 typedef struct  {
-  srslte_sequence_t seq_f2[SRSLTE_NSUBFRAMES_X_FRAME];   
+  srslte_sequence_t seq_f2[SRSLTE_NSUBFRAMES_X_FRAME];
+  bool sequence_generated;
 } srslte_pucch_user_t; 
 
 /* PUCCH object */
diff --git a/lib/include/srslte/phy/phch/pusch.h b/lib/include/srslte/phy/phch/pusch.h
index bf04a4781..e5ee43995 100644
--- a/lib/include/srslte/phy/phch/pusch.h
+++ b/lib/include/srslte/phy/phch/pusch.h
@@ -61,7 +61,8 @@ typedef struct {
 } srslte_pusch_hopping_cfg_t;
 
 typedef struct {
-  srslte_sequence_t seq[SRSLTE_NSUBFRAMES_X_FRAME];  
+  srslte_sequence_t seq[SRSLTE_NSUBFRAMES_X_FRAME];
+  bool sequences_generated;
 } srslte_pusch_user_t; 
 
 /* PUSCH object */
diff --git a/lib/src/phy/phch/pdsch.c b/lib/src/phy/phch/pdsch.c
index 8791ac8e7..9b6128c64 100644
--- a/lib/src/phy/phch/pdsch.c
+++ b/lib/src/phy/phch/pdsch.c
@@ -32,6 +32,7 @@
 #include <stdbool.h>
 #include <assert.h>
 #include <math.h>
+#include <srslte/phy/phch/pdsch.h>
 
 #include "prb_dl.h"
 #include "srslte/phy/phch/pdsch.h"
@@ -362,6 +363,7 @@ int srslte_pdsch_set_rnti(srslte_pdsch_t *q, uint16_t rnti) {
           return SRSLTE_ERROR; 
         }
       }
+      q->users[rnti]->sequence_generated = true;
     }
   }
   return SRSLTE_SUCCESS;
@@ -467,15 +469,15 @@ int srslte_pdsch_decode_multi(srslte_pdsch_t *q,
     srslte_demod_soft_demodulate_s(cfg->grant.mcs.mod, q->d, q->e, cfg->nbits.nof_re);
     
     /* descramble */
-    if (!q->users[rnti]) {
-      srslte_sequence_t seq; 
+    if (q->users[rnti] && q->users[rnti]->sequence_generated) {
+      srslte_scrambling_s_offset(&q->users[rnti]->seq[cfg->sf_idx], q->e, 0, cfg->nbits.nof_bits);
+    } else {
+      srslte_sequence_t seq;
       if (srslte_sequence_pdsch(&seq, rnti, 0, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
-        return SRSLTE_ERROR; 
+        return SRSLTE_ERROR;
       }
-      srslte_scrambling_s_offset(&seq, q->e, 0, cfg->nbits.nof_bits);      
+      srslte_scrambling_s_offset(&seq, q->e, 0, cfg->nbits.nof_bits);
       srslte_sequence_free(&seq);
-    } else {    
-      srslte_scrambling_s_offset(&q->users[rnti]->seq[cfg->sf_idx], q->e, 0, cfg->nbits.nof_bits);      
     }
 
     if (SRSLTE_VERBOSE_ISDEBUG()) {
@@ -537,15 +539,15 @@ int srslte_pdsch_encode(srslte_pdsch_t *q,
     }
 
     /* scramble */
-    if (!q->users[rnti]) {
-      srslte_sequence_t seq; 
+    if (q->users[rnti] && q->users[rnti]->sequence_generated) {
+      srslte_scrambling_bytes(&q->users[rnti]->seq[cfg->sf_idx], (uint8_t*) q->e, cfg->nbits.nof_bits);
+    } else {
+      srslte_sequence_t seq;
       if (srslte_sequence_pdsch(&seq, rnti, 0, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
-        return SRSLTE_ERROR; 
+        return SRSLTE_ERROR;
       }
       srslte_scrambling_bytes(&seq, (uint8_t*) q->e, cfg->nbits.nof_bits);
       srslte_sequence_free(&seq);
-    } else {    
-      srslte_scrambling_bytes(&q->users[rnti]->seq[cfg->sf_idx], (uint8_t*) q->e, cfg->nbits.nof_bits);
     }
     
     srslte_mod_modulate_bytes(&q->mod[cfg->grant.mcs.mod], (uint8_t*) q->e, q->d, cfg->nbits.nof_bits);
diff --git a/lib/src/phy/phch/pucch.c b/lib/src/phy/phch/pucch.c
index c58f69871..6a889b89c 100644
--- a/lib/src/phy/phch/pucch.c
+++ b/lib/src/phy/phch/pucch.c
@@ -33,6 +33,7 @@
 #include <assert.h>
 #include <math.h>
 #include <complex.h>
+#include <srslte/srslte.h>
 
 #include "srslte/phy/ch_estimation/refsignal_ul.h"
 #include "srslte/phy/phch/pucch.h"
@@ -489,7 +490,7 @@ void srslte_pucch_clear_rnti(srslte_pucch_t *q, uint16_t rnti) {
 
 int srslte_pucch_set_crnti(srslte_pucch_t *q, uint16_t rnti) {
   if (!q->users[rnti]) {
-    q->users[rnti] = malloc(sizeof(srslte_pucch_user_t));
+    q->users[rnti] = calloc(1, sizeof(srslte_pucch_user_t));
     if (q->users[rnti]) {
       for (uint32_t sf_idx=0;sf_idx<SRSLTE_NSUBFRAMES_X_FRAME;sf_idx++) {
         // Precompute scrambling sequence for pucch format 2    
@@ -498,6 +499,7 @@ int srslte_pucch_set_crnti(srslte_pucch_t *q, uint16_t rnti) {
           return SRSLTE_ERROR; 
         }        
       }
+      q->users[rnti]->sequence_generated = true;
     }
   }
   return SRSLTE_SUCCESS; 
@@ -591,7 +593,7 @@ static int uci_mod_bits(srslte_pucch_t *q, srslte_pucch_format_t format, uint8_t
     case SRSLTE_PUCCH_FORMAT_2:
     case SRSLTE_PUCCH_FORMAT_2A:
     case SRSLTE_PUCCH_FORMAT_2B:
-      if (q->users[rnti]) {
+      if (q->users[rnti] && q->users[rnti]->sequence_generated) {
         memcpy(q->bits_scram, bits, SRSLTE_PUCCH2_NOF_BITS*sizeof(uint8_t));
         srslte_scrambling_b(&q->users[rnti]->seq_f2[sf_idx], q->bits_scram);
         srslte_mod_modulate(&q->mod, q->bits_scram, q->d, SRSLTE_PUCCH2_NOF_BITS);
@@ -796,7 +798,7 @@ int srslte_pucch_decode(srslte_pucch_t* q, srslte_pucch_format_t format,
       case SRSLTE_PUCCH_FORMAT_2:
       case SRSLTE_PUCCH_FORMAT_2A:
       case SRSLTE_PUCCH_FORMAT_2B:
-        if (q->users[rnti]) {
+        if (q->users[rnti] && q->users[rnti]->sequence_generated) {
           pucch_encode_(q, format, n_pucch, sf_idx, rnti, NULL, ref, true);
           srslte_vec_prod_conj_ccc(q->z, ref, q->z_tmp, SRSLTE_PUCCH_MAX_SYMBOLS);
           for (int i=0;i<SRSLTE_PUCCH2_NOF_BITS/2;i++) {
diff --git a/lib/src/phy/phch/pusch.c b/lib/src/phy/phch/pusch.c
index 5e1cbd922..9e0df307b 100644
--- a/lib/src/phy/phch/pusch.c
+++ b/lib/src/phy/phch/pusch.c
@@ -400,7 +400,8 @@ int srslte_pusch_set_rnti(srslte_pusch_t *q, uint16_t rnti) {
             q->max_re * srslte_mod_bits_x_symbol(SRSLTE_MOD_64QAM))) {
           return SRSLTE_ERROR; 
         }
-      }      
+      }
+      q->users[rnti]->sequences_generated = true;
     }
   }
   return SRSLTE_SUCCESS;
@@ -444,15 +445,15 @@ int srslte_pusch_encode(srslte_pusch_t *q, srslte_pusch_cfg_t *cfg, srslte_softb
       return SRSLTE_ERROR;
     }
 
-    if (!q->users[rnti]) {
-      srslte_sequence_t seq; 
-      if (srslte_sequence_pusch(&seq, rnti, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
-        return SRSLTE_ERROR; 
-      }
-      srslte_scrambling_bytes(&seq, (uint8_t*) q->q, cfg->nbits.nof_bits);      
-      srslte_sequence_free(&seq);
+    if (q->users[rnti] && q->users[rnti]->sequences_generated) {
+      srslte_scrambling_bytes(&q->users[rnti]->seq[cfg->sf_idx], (uint8_t*) q->q, cfg->nbits.nof_bits);
     } else {
-      srslte_scrambling_bytes(&q->users[rnti]->seq[cfg->sf_idx], (uint8_t*) q->q, cfg->nbits.nof_bits);            
+      srslte_sequence_t seq;
+      if (srslte_sequence_pusch(&seq, rnti, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
+        return SRSLTE_ERROR;
+      }
+      srslte_scrambling_bytes(&seq, (uint8_t*) q->q, cfg->nbits.nof_bits);
+      srslte_sequence_free(&seq);
     }
     
     // Correct UCI placeholder/repetition bits    
@@ -535,13 +536,13 @@ int srslte_pusch_decode(srslte_pusch_t *q,
     srslte_sequence_t *seq = NULL;
 
     // Create sequence if does not exist
-    if (!q->users[rnti]) {
-      seq = &q->tmp_seq; 
-      if (srslte_sequence_pusch(seq, rnti, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
-        return SRSLTE_ERROR; 
-      }
+    if (q->users[rnti] && q->users[rnti]->sequences_generated) {
+      seq = &q->users[rnti]->seq[cfg->sf_idx];
     } else {
-      seq = &q->users[rnti]->seq[cfg->sf_idx]; 
+      seq = &q->tmp_seq;
+      if (srslte_sequence_pusch(seq, rnti, 2 * cfg->sf_idx, q->cell.id, cfg->nbits.nof_bits)) {
+        return SRSLTE_ERROR;
+      }
     }
     
     // Decode RI/HARQ bits before descrambling 
@@ -553,7 +554,7 @@ int srslte_pusch_decode(srslte_pusch_t *q,
     // Descrambling
     srslte_scrambling_s_offset(seq, q->q, 0, cfg->nbits.nof_bits);
         
-    if (!q->users[rnti]) {
+    if (!(q->users[rnti] && q->users[rnti]->sequences_generated)) {
       srslte_sequence_free(seq);
     }
     

From 2b775462f72602701af800ffe57376d183de53ff Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 25 Sep 2017 13:07:01 +0200
Subject: [PATCH 17/55] Added LV_HAVE_AVX512 to CMakeLists

---
 CMakeLists.txt              |  5 ++++
 cmake/modules/FindSSE.cmake | 46 +++++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d354a5497..40d3ef4da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -282,6 +282,11 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
     endif(HAVE_AVX)
   endif (HAVE_AVX2)
 
+  if (HAVE_AVX512) # set by the AVX512F compile-and-run probe in cmake/modules/FindSSE.cmake
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -DLV_HAVE_AVX512") # LV_HAVE_AVX512 selects the 512-bit paths in simd.h
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DLV_HAVE_AVX512")
+  endif(HAVE_AVX512)
+
   if(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
     if(HAVE_SSE)
       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Ofast -funroll-loops")
diff --git a/cmake/modules/FindSSE.cmake b/cmake/modules/FindSSE.cmake
index de8b38d1d..4c9673a9d 100644
--- a/cmake/modules/FindSSE.cmake
+++ b/cmake/modules/FindSSE.cmake
@@ -4,10 +4,11 @@
 
 include(CheckCSourceRuns)
 
-option(ENABLE_SSE  "Enable compile-time SSE4.1 support." ON)
-option(ENABLE_AVX  "Enable compile-time AVX support."    ON)
-option(ENABLE_AVX2 "Enable compile-time AVX2 support."   ON)
-option(ENABLE_FMA "Enable compile-time FMA support."     ON)
+option(ENABLE_SSE    "Enable compile-time SSE4.1 support." ON)
+option(ENABLE_AVX    "Enable compile-time AVX support."    ON)
+option(ENABLE_AVX2   "Enable compile-time AVX2 support."   ON)
+option(ENABLE_FMA    "Enable compile-time FMA support."    ON)
+option(ENABLE_AVX512 "Enable compile-time AVX512 support." ON)
 
 if (ENABLE_SSE)
     #
@@ -135,6 +136,41 @@ if (ENABLE_SSE)
         endif()
     endif()
 
+    if (ENABLE_AVX512)
+
+        # check_c_source_runs compiles AND executes the program, so HAVE_AVX512 reflects the build host.
+        # Check compiler for AVX512 intrinsics
+        # NOTE(review): CMAKE_COMPILER_IS_CLANG is not defined by CMake itself - confirm the project sets it.
+        if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANG )
+            set(CMAKE_REQUIRED_FLAGS "-mavx512f")
+            check_c_source_runs("
+          #include <immintrin.h>
+          int main()
+          {
+            __m512i a, b, c;
+            const int src[16] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8 , 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF};
+            int dst[16];
+            a =  _mm512_loadu_si512( (__m512i*)src );
+            b =  _mm512_loadu_si512( (__m512i*)src );
+            c = _mm512_add_epi32( a, b );
+            _mm512_storeu_si512( (__m512i*)dst, c );
+            int i = 0;
+            for( i = 0; i < 16; i++ ){
+              if( ( src[i] + src[i] ) != dst[i] ){
+                return -1;
+              }
+            }
+            return 0;
+          }"
+                    HAVE_AVX512)
+        endif()
+
+        if (HAVE_AVX512)
+            message(STATUS "AVX512 is enabled - target CPU must support it")
+        endif()
+    endif()
+
+
 endif()
 
-mark_as_advanced(HAVE_SSE, HAVE_AVX, HAVE_AVX2, HAVE_FMA)
+mark_as_advanced(HAVE_SSE HAVE_AVX HAVE_AVX2 HAVE_FMA HAVE_AVX512) # space-separated: a comma would become part of the variable name

From 8078238cb59624d6c7ef5eec87195f361092c9ea Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 25 Sep 2017 13:08:38 +0200
Subject: [PATCH 18/55] Removed test macros from mat.h

---
 lib/include/srslte/phy/utils/mat.h |  8 +-------
 lib/src/phy/utils/mat.c            |  1 +
 lib/src/phy/utils/test/mat_test.c  | 32 +++++++++++++++++++++++++++---
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/lib/include/srslte/phy/utils/mat.h b/lib/include/srslte/phy/utils/mat.h
index d960590c4..942559955 100644
--- a/lib/include/srslte/phy/utils/mat.h
+++ b/lib/include/srslte/phy/utils/mat.h
@@ -27,14 +27,8 @@
 #ifndef SRSLTE_MAT_H
 #define SRSLTE_MAT_H
 
-#include "srslte/phy/utils/simd.h"
 #include "srslte/config.h"
-
-
-/*
- * Generic Macros
- */
-#define RANDOM_CF() (((float)rand())/((float)RAND_MAX) + _Complex_I*((float)rand())/((float)RAND_MAX))
+#include "srslte/phy/utils/simd.h"
 
 /* Generic implementation for complex reciprocal */
 SRSLTE_API cf_t srslte_mat_cf_recip_gen(cf_t a);
diff --git a/lib/src/phy/utils/mat.c b/lib/src/phy/utils/mat.c
index 439daa2ce..bbfc38135 100644
--- a/lib/src/phy/utils/mat.c
+++ b/lib/src/phy/utils/mat.c
@@ -27,6 +27,7 @@
 #include <complex.h>
 #include <math.h>
 
+#include <srslte/config.h>
 #include "srslte/phy/utils/mat.h"
 
 
diff --git a/lib/src/phy/utils/test/mat_test.c b/lib/src/phy/utils/test/mat_test.c
index 49be5c9ae..46081da98 100644
--- a/lib/src/phy/utils/test/mat_test.c
+++ b/lib/src/phy/utils/test/mat_test.c
@@ -33,12 +33,18 @@
 #include <sys/time.h>
 
 #include "srslte/phy/utils/mat.h"
+#include "srslte/phy/utils/simd.h"
+#include "srslte/phy/utils/vector.h"
 
 
 bool zf_solver = false;
 bool mmse_solver = false;
 bool verbose = false;
 
+#define RANDOM_F() (((float)rand())/((float)RAND_MAX)) /* float in [0, 1]; outer parentheses keep it safe inside larger expressions */
+#define RANDOM_S() ((int16_t)(rand() & 0x800F)) /* bitwise mask (was logical &&, which can only yield 0 or 1) */
+#define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F()) /* complex value with real and imaginary parts each drawn from RANDOM_F() */
+
 double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
   if (ts_end->tv_usec > ts_start->tv_usec) {
     return ((double) ts_end->tv_sec - (double) ts_start->tv_sec) * 1000000 +
@@ -49,16 +55,16 @@ double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
   }
 }
 
-#define NOF_REPETITIONS 1000
+#define BLOCK_SIZE 1000
 #define RUN_TEST(FUNCTION) /*TYPE NAME (void)*/ { \
   int i;\
   struct timeval start, end;\
   gettimeofday(&start, NULL); \
   bool ret = true; \
-  for (i = 0; i < NOF_REPETITIONS; i++) {ret &= FUNCTION ();}\
+  for (i = 0; i < BLOCK_SIZE; i++) {ret &= FUNCTION ();}\
   gettimeofday(&end, NULL);\
   if (verbose) printf("%32s: %s ... %6.2f us/call\n", #FUNCTION, (ret)?"Pass":"Fail", \
-                      elapsed_us(&start, &end)/NOF_REPETITIONS);\
+                      elapsed_us(&start, &end)/BLOCK_SIZE);\
   passed &= ret;\
 }
 
@@ -373,6 +379,24 @@ bool test_mmse_solver_avx(void) {
 
 #endif /* LV_HAVE_AVX */
 
+bool test_vec_dot_prod_ccc(void) { /* checks srslte_vec_dot_prod_ccc against a plain scalar loop */
+  __attribute__((aligned(256))) cf_t a[14]; /* 256-byte aligned inputs; 14 elements */
+  __attribute__((aligned(256))) cf_t b[14];
+  cf_t res = 0, gold = 0;
+
+  for (int i = 0; i < 14; i++) {
+    a[i] = RANDOM_CF();
+    b[i] = RANDOM_CF();
+  }
+
+  res = srslte_vec_dot_prod_ccc(a, b, 14); /* implementation under test */
+
+  for (int i=0;i<14;i++) { /* reference result: sum of a[i]*b[i], no conjugation */
+    gold += a[i]*b[i];
+  }
+
+  return (cabsf(res - gold) < 1e-3); /* pass when the absolute error is below 1e-3 */
+}
 
 int main(int argc, char **argv) {
   bool passed = true;
@@ -405,6 +429,8 @@ int main(int argc, char **argv) {
 #endif /* LV_HAVE_AVX */
   }
 
+  RUN_TEST(test_vec_dot_prod_ccc);
+
   printf("%s!\n", (passed) ? "Ok" : "Failed");
 
   if (!passed) {

From 1c3b5552be004064e16527fba55e2d419a1143b2 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 25 Sep 2017 13:15:59 +0200
Subject: [PATCH 19/55] added c16 type and architecture independent inline SIMD
 calls

---
 lib/include/srslte/config.h         |   1 +
 lib/include/srslte/phy/utils/simd.h | 833 +++++++++++++++++++++++++++-
 2 files changed, 832 insertions(+), 2 deletions(-)

diff --git a/lib/include/srslte/config.h b/lib/include/srslte/config.h
index 68076c0c8..8a988a971 100644
--- a/lib/include/srslte/config.h
+++ b/lib/include/srslte/config.h
@@ -59,5 +59,6 @@
 
 // cf_t definition
 typedef _Complex float cf_t;
+typedef _Complex short int c16_t;
 
 #endif // CONFIG_H
diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 420d07213..774dd54bd 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -27,6 +27,8 @@
 #ifndef SRSLTE_SIMD_H_H
 #define SRSLTE_SIMD_H_H
 
+#include <immintrin.h>
+
 /*
  * SSE Macros
  */
@@ -44,7 +46,7 @@
 /*
  * AVX Macros
  */
-#ifdef LV_HAVE_AVX
+#ifdef LV_HAVE_AVX2
 
 #define _MM256_MULJ_PS(X) _mm256_permute_ps(_MM256_CONJ_PS(X), 0b10110001)
 #define _MM256_CONJ_PS(X) (_mm256_xor_ps(X, _mm256_set_ps(-0.0f, 0.0f, -0.0f, 0.0f, -0.0f, 0.0f, -0.0f, 0.0f)))
@@ -60,7 +62,7 @@
 #define _MM256_PROD_PS(a, b) _mm256_addsub_ps(_mm256_mul_ps(a,_mm256_moveldup_ps(b)),\
                               _mm256_mul_ps(_mm256_shuffle_ps(a,a,0xB1),_mm256_movehdup_ps(b)))
 #endif /* LV_HAVE_FMA */
-#endif /* LV_HAVE_AVX */
+#endif /* LV_HAVE_AVX2 */
 
 
 /*
@@ -78,4 +80,831 @@
                               _mm256_fmsubadd_ps(_mm256_shuffle_ps(A,A,0xB1),_mm256_movehdup_ps(B), C))
 #endif /* LV_HAVE_FMA */
 
+
+
+/*
+ * Elements held by one SIMD register for each element type, chosen from the
+ * widest enabled extension. A size of 0 compiles out the matching group of
+ * wrappers below (note that C16 is 0 when AVX512 is selected).
+ */
+#if defined(LV_HAVE_AVX512)
+
+#define SRSLTE_SIMD_F_SIZE    16
+#define SRSLTE_SIMD_CF_SIZE   16
+
+#define SRSLTE_SIMD_S_SIZE    32
+#define SRSLTE_SIMD_C16_SIZE  0
+
+#elif defined(LV_HAVE_AVX2)
+
+#define SRSLTE_SIMD_F_SIZE    8
+#define SRSLTE_SIMD_CF_SIZE   8
+
+#define SRSLTE_SIMD_S_SIZE    16
+#define SRSLTE_SIMD_C16_SIZE  16
+
+#elif defined(LV_HAVE_SSE)
+
+#define SRSLTE_SIMD_F_SIZE    4
+#define SRSLTE_SIMD_CF_SIZE   4
+
+#define SRSLTE_SIMD_S_SIZE    8
+#define SRSLTE_SIMD_C16_SIZE  8
+
+#else /* no SIMD extension enabled */
+
+#define SRSLTE_SIMD_F_SIZE    0
+#define SRSLTE_SIMD_CF_SIZE   0
+
+#define SRSLTE_SIMD_S_SIZE    0
+#define SRSLTE_SIMD_C16_SIZE  0
+
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+
+
+
+#if SRSLTE_SIMD_F_SIZE
+
+/*
+ * Data types: simd_f_t is one packed single-precision register of the
+ * widest enabled extension (__m512, __m256 or __m128), i.e. it carries
+ * SRSLTE_SIMD_F_SIZE floats.
+ */
+#if defined(LV_HAVE_AVX512)
+typedef __m512 simd_f_t;
+#elif defined(LV_HAVE_AVX2)
+typedef __m256 simd_f_t;
+#elif defined(LV_HAVE_SSE)
+typedef __m128 simd_f_t;
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+
+/* Single precision Floating point functions */
+/*
+ * Loads SRSLTE_SIMD_F_SIZE floats from ptr with the aligned-load
+ * intrinsic; ptr must be aligned to the register width.
+ */
+static inline simd_f_t srslte_simd_f_load(float *ptr) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_load_ps(ptr);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_load_ps(ptr);
+#elif defined(LV_HAVE_SSE)
+  return _mm_load_ps(ptr);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Loads SRSLTE_SIMD_F_SIZE floats from ptr with the unaligned-load
+ * intrinsic, so ptr needs no particular alignment.
+ */
+static inline simd_f_t srslte_simd_f_loadu(float *ptr) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_loadu_ps(ptr);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_loadu_ps(ptr);
+#elif defined(LV_HAVE_SSE)
+  return _mm_loadu_ps(ptr);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Writes the SRSLTE_SIMD_F_SIZE floats of simdreg to ptr with the
+ * aligned-store intrinsic; ptr must be aligned to the register width.
+ */
+static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_store_ps(ptr, simdreg);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_store_ps(ptr, simdreg);
+#elif defined(LV_HAVE_SSE)
+  _mm_store_ps(ptr, simdreg);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Writes the SRSLTE_SIMD_F_SIZE floats of simdreg to ptr with the
+ * unaligned-store intrinsic, so ptr needs no particular alignment.
+ */
+static inline void srslte_simd_f_storeu(float *ptr, simd_f_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_storeu_ps(ptr, simdreg);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_storeu_ps(ptr, simdreg);
+#elif defined(LV_HAVE_SSE)
+  _mm_storeu_ps(ptr, simdreg);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Returns a register in which every one of the SRSLTE_SIMD_F_SIZE lanes
+ * holds the scalar x (broadcast).
+ */
+static inline simd_f_t srslte_simd_f_set1(float x) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_set1_ps(x);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_set1_ps(x);
+#elif defined(LV_HAVE_SSE)
+  return _mm_set1_ps(x);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise single-precision product a[i] * b[i], using the multiply
+ * intrinsic of the widest enabled extension.
+ */
+static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_mul_ps(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_mul_ps(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_mul_ps(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Alternating subtract/add: even lanes get a[i] - b[i], odd lanes a[i] + b[i].
+ * The AVX512 mask selects the even lanes so it matches _mm256/_mm_addsub_ps.
+ */
+static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
+#if defined(LV_HAVE_AVX512)
+  __m512 r = _mm512_add_ps(a, b); /* odd lanes keep the sum */
+  return _mm512_mask_sub_ps(r, 0b0101010101010101, a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_addsub_ps(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_addsub_ps(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise single-precision difference a[i] - b[i], using the subtract
+ * intrinsic of the widest enabled extension.
+ */
+static inline simd_f_t srslte_simd_f_sub(simd_f_t a, simd_f_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_sub_ps(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_sub_ps(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_sub_ps(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise single-precision sum a[i] + b[i], using the add intrinsic of
+ * the widest enabled extension.
+ */
+static inline simd_f_t srslte_simd_f_add(simd_f_t a, simd_f_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_add_ps(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_add_ps(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_add_ps(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Returns a register with all SRSLTE_SIMD_F_SIZE lanes set to 0.0f,
+ * typically used to start an accumulator.
+ */
+static inline simd_f_t srslte_simd_f_zero (void) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_setzero_ps();
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_setzero_ps();
+#elif defined(LV_HAVE_SSE)
+  return _mm_setzero_ps();
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Exchanges every adjacent pair of lanes (0<->1, 2<->3, ...).
+ * 0xB1 is the same immediate as 0b10110001: selectors 1,0,3,2.
+ */
+static inline simd_f_t srslte_simd_f_swap(simd_f_t a) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_permute_ps(a, 0xB1);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_permute_ps(a, 0xB1);
+#elif defined(LV_HAVE_SSE)
+  return _mm_shuffle_ps(a, a, 0xB1);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Horizontal pairwise add: the lower half of the result holds
+ * a[2k] + a[2k+1] and the upper half b[2k] + b[2k+1], in increasing k.
+ */
+static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) {
+#if defined(LV_HAVE_AVX512)
+  const __m512i even_idx = _mm512_setr_epi32(0, 2,
+                                             4, 6,
+                                             8, 10,
+                                             12, 14,
+                                             16, 18,
+                                             20, 22,
+                                             24, 26,
+                                             28, 30);
+  const __m512i odd_idx = _mm512_or_epi32(even_idx, _mm512_set1_epi32(1));
+
+  simd_f_t evens = _mm512_permutex2var_ps(a, even_idx, b);
+  simd_f_t odds = _mm512_permutex2var_ps(a, odd_idx, b);
+  return _mm512_add_ps(evens, odds);
+#elif defined(LV_HAVE_AVX2)
+  simd_f_t lo_halves = _mm256_permute2f128_ps(a, b, 0b00100000);
+  simd_f_t hi_halves = _mm256_permute2f128_ps(a, b, 0b00110001);
+  return _mm256_hadd_ps(lo_halves, hi_halves);
+#elif defined(LV_HAVE_SSE)
+  return _mm_hadd_ps(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise single-precision square root of a, using the sqrt intrinsic
+ * of the widest enabled extension.
+ */
+static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_sqrt_ps(a);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_sqrt_ps(a);
+#elif defined(LV_HAVE_SSE)
+  return _mm_sqrt_ps(a);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+#endif /* SRSLTE_SIMD_F_SIZE */
+
+
+#if SRSLTE_SIMD_CF_SIZE
+
+typedef struct {
+  simd_f_t re; /* real parts, one complex sample per lane */
+  simd_f_t im; /* imaginary parts, same lane order as re */
+} simd_cf_t;
+
+/* Complex Single precision Floating point functions */
+/*
+ * Aligned load of SRSLTE_SIMD_CF_SIZE interleaved (re,im) samples, split into
+ * re/im registers. On the AVX paths the lane order is not sequential, but it
+ * is the same in re and im and srslte_simd_cfi_store() restores it.
+ */
+static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  __m512 lo = _mm512_permute_ps(_mm512_load_ps((float*)(ptr)), 0b11011000);
+  __m512 hi = _mm512_permute_ps(_mm512_load_ps((float*)(ptr + 8)), 0b11011000);
+  out.re = _mm512_unpacklo_ps(lo, hi);
+  out.im = _mm512_unpackhi_ps(lo, hi);
+#elif defined(LV_HAVE_AVX2)
+  __m256 lo = _mm256_permute_ps(_mm256_load_ps((float*)(ptr)), 0b11011000);
+  __m256 hi = _mm256_permute_ps(_mm256_load_ps((float*)(ptr + 4)), 0b11011000);
+  out.re = _mm256_unpacklo_ps(lo, hi);
+  out.im = _mm256_unpackhi_ps(lo, hi);
+#elif defined(LV_HAVE_SSE)
+  __m128 lo = _mm_load_ps((float*)(ptr));
+  __m128 hi = _mm_load_ps((float*)(ptr + 2));
+  out.re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+  out.im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Unaligned counterpart of srslte_simd_cfi_load(): reads SRSLTE_SIMD_CF_SIZE
+ * interleaved (re,im) samples from ptr and splits them into re/im registers
+ * with the same lane order as the aligned variant.
+ */
+static inline simd_cf_t srslte_simd_cfi_loadu(cf_t *ptr) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  __m512 lo = _mm512_permute_ps(_mm512_loadu_ps((float*)(ptr)), 0b11011000);
+  __m512 hi = _mm512_permute_ps(_mm512_loadu_ps((float*)(ptr + 8)), 0b11011000);
+  out.re = _mm512_unpacklo_ps(lo, hi);
+  out.im = _mm512_unpackhi_ps(lo, hi);
+#elif defined(LV_HAVE_AVX2)
+  __m256 lo = _mm256_permute_ps(_mm256_loadu_ps((float*)(ptr)), 0b11011000);
+  __m256 hi = _mm256_permute_ps(_mm256_loadu_ps((float*)(ptr + 4)), 0b11011000);
+  out.re = _mm256_unpacklo_ps(lo, hi);
+  out.im = _mm256_unpackhi_ps(lo, hi);
+#elif defined(LV_HAVE_SSE)
+  __m128 lo = _mm_loadu_ps((float*)(ptr));
+  __m128 hi = _mm_loadu_ps((float*)(ptr + 2));
+  out.re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+  out.im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Aligned load of a complex vector stored as two separate float arrays:
+ * SRSLTE_SIMD_CF_SIZE floats from re and the same number from im.
+ */
+static inline simd_cf_t srslte_simd_cf_load(float *re, float *im) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_load_ps(re);
+  out.im = _mm512_load_ps(im);
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_load_ps(re);
+  out.im = _mm256_load_ps(im);
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_load_ps(re);
+  out.im = _mm_load_ps(im);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Unaligned load of a complex vector stored as two separate float arrays:
+ * SRSLTE_SIMD_CF_SIZE floats from re and the same number from im.
+ */
+static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_loadu_ps(re);
+  out.im = _mm512_loadu_ps(im);
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_loadu_ps(re);
+  out.im = _mm256_loadu_ps(im);
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_loadu_ps(re);
+  out.im = _mm_loadu_ps(im);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Aligned store of a split re/im register pair back to interleaved (re,im)
+ * samples at ptr; inverse of the lane arrangement made by srslte_simd_cfi_load().
+ */
+static inline void srslte_simd_cfi_store(cf_t *ptr, simd_cf_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  __m512 re_p = _mm512_permute_ps(simdreg.re, 0b11011000);
+  __m512 im_p = _mm512_permute_ps(simdreg.im, 0b11011000);
+  _mm512_store_ps((float*)(ptr), _mm512_unpacklo_ps(re_p, im_p));
+  _mm512_store_ps((float*)(ptr + 8), _mm512_unpackhi_ps(re_p, im_p));
+#elif defined(LV_HAVE_AVX2)
+  __m256 re_p = _mm256_permute_ps(simdreg.re, 0b11011000);
+  __m256 im_p = _mm256_permute_ps(simdreg.im, 0b11011000);
+  _mm256_store_ps((float*)(ptr), _mm256_unpacklo_ps(re_p, im_p));
+  _mm256_store_ps((float*)(ptr + 4), _mm256_unpackhi_ps(re_p, im_p));
+#elif defined(LV_HAVE_SSE)
+  _mm_store_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im));
+  _mm_store_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Unaligned store of a split re/im register pair back to interleaved (re,im)
+ * samples at ptr; same re-interleaving as srslte_simd_cfi_store().
+ */
+static inline void srslte_simd_cfi_storeu(cf_t *ptr, simd_cf_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  __m512 re_p = _mm512_permute_ps(simdreg.re, 0b11011000);
+  __m512 im_p = _mm512_permute_ps(simdreg.im, 0b11011000);
+  _mm512_storeu_ps((float*)(ptr), _mm512_unpacklo_ps(re_p, im_p));
+  _mm512_storeu_ps((float*)(ptr + 8), _mm512_unpackhi_ps(re_p, im_p));
+#elif defined(LV_HAVE_AVX2)
+  __m256 re_p = _mm256_permute_ps(simdreg.re, 0b11011000);
+  __m256 im_p = _mm256_permute_ps(simdreg.im, 0b11011000);
+  _mm256_storeu_ps((float*)(ptr), _mm256_unpacklo_ps(re_p, im_p));
+  _mm256_storeu_ps((float*)(ptr + 4), _mm256_unpackhi_ps(re_p, im_p));
+#elif defined(LV_HAVE_SSE)
+  _mm_storeu_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im));
+  _mm_storeu_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Aligned store of simdreg.re to the re array and simdreg.im to the im
+ * array (split, non-interleaved layout).
+ */
+static inline void srslte_simd_cf_store(float *re, float *im, simd_cf_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_store_ps(re, simdreg.re);
+  _mm512_store_ps(im, simdreg.im);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_store_ps(re, simdreg.re);
+  _mm256_store_ps(im, simdreg.im);
+#elif defined(LV_HAVE_SSE)
+  _mm_store_ps(re, simdreg.re);
+  _mm_store_ps(im, simdreg.im);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Unaligned store of simdreg.re to the re array and simdreg.im to the im
+ * array (split, non-interleaved layout).
+ */
+static inline void srslte_simd_cf_storeu(float *re, float *im, simd_cf_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_storeu_ps(re, simdreg.re);
+  _mm512_storeu_ps(im, simdreg.im);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_storeu_ps(re, simdreg.re);
+  _mm256_storeu_ps(im, simdreg.im);
+#elif defined(LV_HAVE_SSE)
+  _mm_storeu_ps(re, simdreg.re);
+  _mm_storeu_ps(im, simdreg.im);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Broadcasts the complex scalar x: every lane of re receives its real part
+ * and every lane of im its imaginary part.
+ */
+static inline simd_cf_t srslte_simd_cf_set1 (cf_t x) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_set1_ps(__real__ x);
+  out.im = _mm512_set1_ps(__imag__ x);
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_set1_ps(__real__ x);
+  out.im = _mm256_set1_ps(__imag__ x);
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_set1_ps(__real__ x);
+  out.im = _mm_set1_ps(__imag__ x);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Lane-wise complex product a * b on split registers:
+ *   re = a.re*b.re - a.im*b.im,  im = a.re*b.im + a.im*b.re
+ */
+static inline simd_cf_t srslte_simd_cf_prod (simd_cf_t a, simd_cf_t b) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_sub_ps(_mm512_mul_ps(a.re, b.re),
+                         _mm512_mul_ps(a.im, b.im));
+  out.im = _mm512_add_ps(_mm512_mul_ps(a.re, b.im),
+                         _mm512_mul_ps(a.im, b.re));
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_sub_ps(_mm256_mul_ps(a.re, b.re),
+                         _mm256_mul_ps(a.im, b.im));
+  out.im = _mm256_add_ps(_mm256_mul_ps(a.re, b.im),
+                         _mm256_mul_ps(a.im, b.re));
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_sub_ps(_mm_mul_ps(a.re, b.re),
+                      _mm_mul_ps(a.im, b.im));
+  out.im = _mm_add_ps(_mm_mul_ps(a.re, b.im),
+                      _mm_mul_ps(a.im, b.re));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Lane-wise product of a with the conjugate of b on split registers:
+ *   re = a.re*b.re + a.im*b.im,  im = a.im*b.re - a.re*b.im
+ */
+static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_add_ps(_mm512_mul_ps(a.re, b.re),
+                         _mm512_mul_ps(a.im, b.im));
+  out.im = _mm512_sub_ps(_mm512_mul_ps(a.im, b.re),
+                         _mm512_mul_ps(a.re, b.im));
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_add_ps(_mm256_mul_ps(a.re, b.re),
+                         _mm256_mul_ps(a.im, b.im));
+  out.im = _mm256_sub_ps(_mm256_mul_ps(a.im, b.re),
+                         _mm256_mul_ps(a.re, b.im));
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_add_ps(_mm_mul_ps(a.re, b.re),
+                      _mm_mul_ps(a.im, b.im));
+  out.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re),
+                      _mm_mul_ps(a.re, b.im));
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Lane-wise complex sum a + b on split registers: real and imaginary
+ * parts are added independently.
+ */
+static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_add_ps(a.re, b.re);
+  out.im = _mm512_add_ps(a.im, b.im);
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_add_ps(a.re, b.re);
+  out.im = _mm256_add_ps(a.im, b.im);
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_add_ps(a.re, b.re);
+  out.im = _mm_add_ps(a.im, b.im);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/*
+ * Returns a complex register pair with every real and imaginary lane set
+ * to 0.0f.
+ */
+static inline simd_cf_t srslte_simd_cf_zero (void) {
+  simd_cf_t out;
+#if defined(LV_HAVE_AVX512)
+  out.re = _mm512_setzero_ps();
+  out.im = _mm512_setzero_ps();
+#elif defined(LV_HAVE_AVX2)
+  out.re = _mm256_setzero_ps();
+  out.im = _mm256_setzero_ps();
+#elif defined(LV_HAVE_SSE)
+  out.re = _mm_setzero_ps();
+  out.im = _mm_setzero_ps();
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+#endif /* SRSLTE_SIMD_CF_SIZE */
+
+
+#if SRSLTE_SIMD_S_SIZE
+
+
+/*
+ * simd_s_t is one packed integer register of the widest enabled extension,
+ * used here as SRSLTE_SIMD_S_SIZE lanes of int16_t.
+ */
+#if defined(LV_HAVE_AVX512)
+typedef __m512i simd_s_t;
+#elif defined(LV_HAVE_AVX2)
+typedef __m256i simd_s_t;
+#elif defined(LV_HAVE_SSE)
+typedef __m128i simd_s_t;
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+
+/*
+ * Aligned load of SRSLTE_SIMD_S_SIZE int16_t values. The pointer is cast to
+ * the vector pointer type the integer load intrinsics are declared with.
+ */
+static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_load_si512((__m512i*)ptr);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_load_si256((__m256i*)ptr);
+#elif defined(LV_HAVE_SSE)
+  return _mm_load_si128((__m128i*)ptr);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Unaligned load of SRSLTE_SIMD_S_SIZE int16_t values. Uses the loadu
+ * intrinsics so that ptr needs no particular alignment.
+ */
+static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_loadu_si512((__m512i*)ptr);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_loadu_si256((__m256i*)ptr);
+#elif defined(LV_HAVE_SSE)
+  return _mm_loadu_si128((__m128i*)ptr);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Aligned store of the SRSLTE_SIMD_S_SIZE int16_t lanes of simdreg. The
+ * pointer is cast to the vector pointer type the store intrinsics expect.
+ */
+static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_store_si512((__m512i*)ptr, simdreg);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_store_si256((__m256i*)ptr, simdreg);
+#elif defined(LV_HAVE_SSE)
+  _mm_store_si128((__m128i*)ptr, simdreg);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Unaligned store of the SRSLTE_SIMD_S_SIZE int16_t lanes of simdreg. The
+ * pointer is cast to the vector pointer type the store intrinsics expect.
+ */
+static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) {
+#if defined(LV_HAVE_AVX512)
+  _mm512_storeu_si512((__m512i*)ptr, simdreg);
+#elif defined(LV_HAVE_AVX2)
+  _mm256_storeu_si256((__m256i*)ptr, simdreg);
+#elif defined(LV_HAVE_SSE)
+  _mm_storeu_si128((__m128i*)ptr, simdreg);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Returns an integer register with all bits cleared, i.e. every int16_t
+ * lane equal to 0.
+ */
+static inline simd_s_t srslte_simd_s_zero(void) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_setzero_si512();
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_setzero_si256();
+#elif defined(LV_HAVE_SSE)
+  return _mm_setzero_si128();
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise 16-bit product of a and b; the mullo intrinsics keep only the
+ * low 16 bits of each 32-bit intermediate result.
+ */
+static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_mullo_epi16(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_mullo_epi16(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_mullo_epi16(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise 16-bit sum a[i] + b[i] using the non-saturating add_epi16
+ * intrinsics (results wrap on overflow).
+ */
+static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_add_epi16(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_add_epi16(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_add_epi16(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/*
+ * Lane-wise 16-bit difference a[i] - b[i] using the non-saturating
+ * sub_epi16 intrinsics (results wrap on overflow).
+ */
+static inline simd_s_t srslte_simd_s_sub(simd_s_t a, simd_s_t b) {
+#if defined(LV_HAVE_AVX512)
+  return _mm512_sub_epi16(a, b);
+#elif defined(LV_HAVE_AVX2)
+  return _mm256_sub_epi16(a, b);
+#elif defined(LV_HAVE_SSE)
+  return _mm_sub_epi16(a, b);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+#endif /* SRSLTE_SIMD_S_SIZE */
+
+
+#if SRSLTE_SIMD_C16_SIZE
+
+/*
+ * Split complex 16-bit register pair. Each half is a union so the lanes can
+ * be reached either as one vector register or as individual int16_t values.
+ */
+typedef struct {
+#if defined(LV_HAVE_AVX512)
+  union {
+    __m512i m512;
+    int16_t i16[32];
+  } re;
+  union {
+    __m512i m512;
+    int16_t i16[32];
+  } im;
+#elif defined(LV_HAVE_AVX2)
+  union {
+    __m256i m256;
+    int16_t i16[16];
+  } re;
+  union {
+    __m256i m256;
+    int16_t i16[16];
+  } im;
+#elif defined(LV_HAVE_SSE)
+  union {
+    __m128i m128;
+    int16_t i16[8];
+  } re;
+  union {
+    __m128i m128;
+    int16_t i16[8];
+  } im;
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+} simd_c16_t;
+
+/* Fixed point precision (16-bit) functions */
+/*
+ * Aligned load of two registers of interleaved c16_t samples, split into re/im.
+ * The second load starts one register further on: a c16_t is 4 bytes, so that
+ * is 16 / 8 / 4 samples for 512 / 256 / 128-bit registers. Lanes are permuted.
+ */
+static inline simd_c16_t srslte_simd_c16i_load(c16_t *ptr) {
+  simd_c16_t ret;
+#if defined(LV_HAVE_AVX512)
+  __m512i in1 = _mm512_load_si512((__m512i*)(ptr));
+  __m512i in2 = _mm512_load_si512((__m512i*)(ptr + 16));
+  ret.re.m512 = _mm512_mask_blend_epi16(0xAAAAAAAA, in1,_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(in2, 0b10100000), 0b10100000));
+  ret.im.m512 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(in1, 0b11110101), 0b11110101),in2);
+#elif defined(LV_HAVE_AVX2)
+  __m256i in1 = _mm256_load_si256((__m256i*)(ptr));
+  __m256i in2 = _mm256_load_si256((__m256i*)(ptr + 8));
+  ret.re.m256 = _mm256_blend_epi16(in1,_mm256_shufflelo_epi16(_mm256_shufflehi_epi16(in2, 0b10100000), 0b10100000), 0b10101010);
+  ret.im.m256 = _mm256_blend_epi16(_mm256_shufflelo_epi16(_mm256_shufflehi_epi16(in1, 0b11110101), 0b11110101),in2, 0b10101010);
+#elif defined(LV_HAVE_SSE)
+  __m128i in1 = _mm_load_si128((__m128i*)(ptr));
+  __m128i in2 = _mm_load_si128((__m128i*)(ptr + 4));
+  ret.re.m128 = _mm_blend_epi16(in1,_mm_shufflelo_epi16(_mm_shufflehi_epi16(in2, 0b10100000), 0b10100000), 0b10101010);
+  ret.im.m128 = _mm_blend_epi16(_mm_shufflelo_epi16(_mm_shufflehi_epi16(in1, 0b11110101), 0b11110101),in2, 0b10101010);
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return ret;
+}
+
+/* Aligned load of already split real and imaginary int16_t arrays into
+ * one simd_c16_t (AVX2 and SSE paths only). */
+static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
+  simd_c16_t out;
+#if defined(LV_HAVE_AVX2)
+  out.re.m256 = _mm256_load_si256((__m256i*)(re));
+  out.im.m256 = _mm256_load_si256((__m256i*)(im));
+#elif defined(LV_HAVE_SSE)
+  out.re.m128 = _mm_load_si128((__m128i*)(re));
+  out.im.m128 = _mm_load_si128((__m128i*)(im));
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/* Aligned store of a split re/im pair as two registers of interleaved c16_t.
+ * The second register lands 8 (AVX2) or 4 (SSE) samples after ptr: c16_t is 4 bytes. */
+static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
+#if defined(LV_HAVE_AVX2)
+  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
+  __m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
+  _mm256_store_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
+  _mm256_store_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
+#elif defined(LV_HAVE_SSE)
+  __m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
+  __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
+  _mm_store_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
+  _mm_store_si128((__m128i *) (ptr + 4), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/* Aligned store of simdreg.re to the re array and simdreg.im to the im
+ * array (split layout; AVX2 and SSE paths only). */
+static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
+#if defined(LV_HAVE_AVX2)
+  _mm256_store_si256((__m256i *) re, simdreg.re.m256);
+  _mm256_store_si256((__m256i *) im, simdreg.im.m256);
+#elif defined(LV_HAVE_SSE)
+  _mm_store_si128((__m128i *) re, simdreg.re.m128);
+  _mm_store_si128((__m128i *) im, simdreg.im.m128);
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+}
+
+/* Complex product of split 16-bit re/im lanes; every partial product is
+ * mulhrs_epi16(a, b << 1). NOTE(review): intended fixed-point scaling not stated here. */
+static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
+  simd_c16_t out;
+#if defined(LV_HAVE_AVX2)
+  out.re.m256 = _mm256_sub_epi16(_mm256_mulhrs_epi16(a.re.m256, _mm256_slli_epi16(b.re.m256, 1)),
+                                 _mm256_mulhrs_epi16(a.im.m256, _mm256_slli_epi16(b.im.m256, 1)));
+  out.im.m256 = _mm256_add_epi16(_mm256_mulhrs_epi16(a.re.m256, _mm256_slli_epi16(b.im.m256, 1)),
+                                 _mm256_mulhrs_epi16(a.im.m256, _mm256_slli_epi16(b.re.m256, 1)));
+#elif defined(LV_HAVE_SSE)
+  out.re.m128 = _mm_sub_epi16(_mm_mulhrs_epi16(a.re.m128, _mm_slli_epi16(b.re.m128, 1)),
+                              _mm_mulhrs_epi16(a.im.m128, _mm_slli_epi16(b.im.m128, 1)));
+  out.im.m128 = _mm_add_epi16(_mm_mulhrs_epi16(a.re.m128, _mm_slli_epi16(b.im.m128, 1)),
+                              _mm_mulhrs_epi16(a.im.m128, _mm_slli_epi16(b.re.m128, 1)));
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/* Lane-wise complex 16-bit sum a + b: real and imaginary registers are
+ * added independently with add_epi16 (AVX2 and SSE paths only). */
+static inline simd_c16_t srslte_simd_c16_add (simd_c16_t a, simd_c16_t b) {
+  simd_c16_t out;
+#if defined(LV_HAVE_AVX2)
+  out.re.m256 = _mm256_add_epi16(a.re.m256, b.re.m256);
+  out.im.m256 = _mm256_add_epi16(a.im.m256, b.im.m256);
+#elif defined(LV_HAVE_SSE)
+  out.re.m128 = _mm_add_epi16(a.re.m128, b.re.m128);
+  out.im.m128 = _mm_add_epi16(a.im.m128, b.im.m128);
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+/* Returns a complex 16-bit register pair with every real and imaginary
+ * lane cleared to 0 (AVX2 and SSE paths only). */
+static inline simd_c16_t srslte_simd_c16_zero (void) {
+  simd_c16_t out;
+#if defined(LV_HAVE_AVX2)
+  out.re.m256 = _mm256_setzero_si256();
+  out.im.m256 = _mm256_setzero_si256();
+#elif defined(LV_HAVE_SSE)
+  out.re.m128 = _mm_setzero_si128();
+  out.im.m128 = _mm_setzero_si128();
+#endif /* LV_HAVE_AVX2 / LV_HAVE_SSE */
+  return out;
+}
+
+#endif /* SRSLTE_SIMD_C16_SIZE */
+
+
+
 #endif //SRSLTE_SIMD_H_H

From c9f6bfccd47d6dfcda2930e7cb81465e38fb3b9d Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 25 Sep 2017 13:19:34 +0200
Subject: [PATCH 20/55] Refactored vector library with SIMD independent
 architecture inline functions test-benchmark

---
 lib/include/srslte/phy/utils/vector.h      |    8 +-
 lib/include/srslte/phy/utils/vector_simd.h |   61 +-
 lib/src/phy/utils/test/CMakeLists.txt      |    3 +
 lib/src/phy/utils/test/vector_test.c       |  555 ++++++++++
 lib/src/phy/utils/vector.c                 |  228 +---
 lib/src/phy/utils/vector_simd.c            | 1120 ++++++++++----------
 6 files changed, 1187 insertions(+), 788 deletions(-)
 create mode 100644 lib/src/phy/utils/test/vector_test.c

diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h
index 4a55d18b6..0fadfb334 100644
--- a/lib/include/srslte/phy/utils/vector.h
+++ b/lib/include/srslte/phy/utils/vector.h
@@ -80,8 +80,8 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, uint32_t len)
 SRSLTE_API void srslte_vec_sum_ch(uint8_t *x, uint8_t *y, char *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
-SRSLTE_API void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len);
-SRSLTE_API void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_sub_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_sum_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len);
 
 /* substract two vectors z=x-y */
 SRSLTE_API void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len); 
@@ -91,7 +91,7 @@ SRSLTE_API void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len); 
 
 /* Square distance */
-SRSLTE_API void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints);
+//SRSLTE_API void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints);
 
 /* scalar addition */
 SRSLTE_API void srslte_vec_sc_add_fff(float *x, float h, float *z, uint32_t len); 
@@ -132,7 +132,7 @@ SRSLTE_API void srslte_vec_prod_conj_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len
 
 /* real vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len);
-SRSLTE_API void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len);
 
 /* Dot-product */
 SRSLTE_API cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len);
diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 1010cbed6..8ea2ce9bc 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -35,47 +35,66 @@ extern "C" {
 #include <stdint.h>
 #include "srslte/config.h"
 
-SRSLTE_API int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len); 
+/*
+ * Non-zero when PTR is aligned to the vector width of the enabled extension
+ * (64, 32 or 16 bytes); without SIMD every pointer counts as aligned.
+ */
+#if defined(LV_HAVE_AVX512)
+#define SRSLTE_IS_ALIGNED(PTR) (((uintptr_t)(PTR) & 0x3F) == 0)
+#elif defined(LV_HAVE_AVX)
+#define SRSLTE_IS_ALIGNED(PTR) (((uintptr_t)(PTR) & 0x1F) == 0)
+#elif defined(LV_HAVE_SSE)
+#define SRSLTE_IS_ALIGNED(PTR) (((uintptr_t)(PTR) & 0x0F) == 0)
+#else /* no SIMD extension enabled */
+#define SRSLTE_IS_ALIGNED(PTR) (1)
+#endif /* LV_HAVE_AVX512 / LV_HAVE_AVX / LV_HAVE_SSE */
 
-SRSLTE_API int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len); 
+SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
 
-SRSLTE_API void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
-SRSLTE_API void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len); 
+SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
 SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
 
-SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len);
+SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);
 
-SRSLTE_API void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len);
+SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);
 
-SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len);
+SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);
 
-SRSLTE_API void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len);
+SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);
 
-SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len);
+SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);
 
-SRSLTE_API void srslte_vec_prod_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
 
-SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                             int16_t *r_im, int len);
 
-SRSLTE_API void srslte_vec_prod_conj_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+
+SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);
+
+SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);
 
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
 
+SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);
+
 SRSLTE_API  void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
 
-SRSLTE_API void srslte_vec_abs_square_cf_sse(cf_t *x, float *z, uint32_t len);
+SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); 
+SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);
+
+SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
 
@@ -93,7 +112,9 @@ SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y,
 
 SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
 
-SRSLTE_API void srslte_vec_sc_prod_cfc_avx(const cf_t *x,const float h,cf_t *y,const uint32_t len);
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+
+SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
 
 #ifdef __cplusplus
 }
diff --git a/lib/src/phy/utils/test/CMakeLists.txt b/lib/src/phy/utils/test/CMakeLists.txt
index 4dccbf2a0..76df7ac59 100644
--- a/lib/src/phy/utils/test/CMakeLists.txt
+++ b/lib/src/phy/utils/test/CMakeLists.txt
@@ -42,3 +42,6 @@ target_link_libraries(algebra_test srslte_phy)
 
 add_test(algebra_2x2_zf_solver_test algebra_test -z)
 add_test(algebra_2x2_mmse_solver_test algebra_test -m)
+
+add_executable(vector_test vector_test.c)
+target_link_libraries(vector_test srslte_phy)
\ No newline at end of file
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
new file mode 100644
index 000000000..e781d05b9
--- /dev/null
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -0,0 +1,555 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <complex.h>
+#include <stdbool.h>
+#include <immintrin.h>
+#include <sys/time.h>
+#include <srslte/phy/utils/vector_simd.h>
+#include <memory.h>
+#include <math.h>
+
+#include "srslte/phy/utils/mat.h"
+#include "srslte/phy/utils/simd.h"
+#include "srslte/phy/utils/vector.h"
+
+
+bool zf_solver = false;
+bool mmse_solver = false;
+bool verbose = false;
+
+#define MAX_MSE (1e-3)
+#define NOF_REPETITIONS (1024*128)
+#define MAX_FUNCTIONS (64)
+#define MAX_BLOCKS (16)
+
+#define RANDOM_F() ((float)rand())/((float)RAND_MAX)
+#define RANDOM_S() ((int16_t)(rand() && 0x800F)) /* NOTE(review): '&&' is logical, so this yields only 0 or 1; a bitwise '&' mask may have been intended -- confirm */
+#define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F())
+
+#define TEST_CALL(TEST_CODE)   gettimeofday(&start, NULL);\
+  for (int i = 0; i < NOF_REPETITIONS; i++){TEST_CODE;}\
+  gettimeofday(&end, NULL); \
+  *timing = elapsed_us(&start, &end);
+
+#define TEST(X, CODE) static bool test_##X (char *func_name, double *timing, uint32_t block_size) {\
+    struct timeval start, end;\
+    float mse = 0.0f;\
+    bool passed;\
+    strncpy(func_name, #X, 32);\
+    CODE;\
+    passed = (mse < MAX_MSE);\
+    printf("%32s (%5d) ... %7.1f MSamp/s ... %3s Passed\n", func_name, block_size, \
+    (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not");\
+    return passed;\
+}
+
+#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size)
+
+
+static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
+  if (ts_end->tv_usec > ts_start->tv_usec) {
+    return ((double) ts_end->tv_sec - (double) ts_start->tv_sec) * 1000000 +
+           (double) ts_end->tv_usec - (double) ts_start->tv_usec;
+  } else {
+    return ((double) ts_end->tv_sec - (double) ts_start->tv_sec - 1) * 1000000 +
+           ((double) ts_end->tv_usec + 1000000) - (double) ts_start->tv_usec;
+  }
+}
+
+float squared_error (cf_t a, cf_t b) {
+  float diff_re = __real__ a - __real__ b;
+  float diff_im = __imag__ a - __imag__ b;
+  return diff_re*diff_re + diff_im*diff_im;
+}
+
+TEST(srslte_vec_dot_prod_sss,
+  MALLOC(int16_t, x);
+  MALLOC(int16_t, y);
+  int32_t z; /* full width of the int32_t result; an int16_t would truncate it */
+
+  cf_t gold = 0.0f;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_S();
+    y[i] = RANDOM_S();
+  }
+
+  TEST_CALL(z = srslte_vec_dot_prod_sss(x, y, block_size))
+  /* Reference dot product accumulated by a plain scalar loop */
+  for (int i = 0; i < block_size; i++) {
+    gold += x[i] * y[i];
+  }
+
+  mse += cabsf(gold - z) / cabsf(gold);
+
+  free(x);
+  free(y);
+)
+
+TEST(srslte_vec_sum_sss,
+     MALLOC(int16_t, x);
+         MALLOC(int16_t, y);
+         MALLOC(int16_t, z);
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_S();
+           y[i] = RANDOM_S();
+         }
+
+         TEST_CALL(srslte_vec_sum_sss(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] + y[i];
+           mse += cabsf(gold - z[i]);
+         }
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+TEST(srslte_vec_sub_sss,
+     MALLOC(int16_t, x);
+         MALLOC(int16_t, y);
+         MALLOC(int16_t, z);
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_S();
+           y[i] = RANDOM_S();
+         }
+
+         TEST_CALL(srslte_vec_sub_sss(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] - y[i];
+           mse += cabsf(gold - z[i]);
+         }
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+TEST(srslte_vec_prod_sss,
+     MALLOC(int16_t, x);
+         MALLOC(int16_t, y);
+         MALLOC(int16_t, z);
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_S();
+           y[i] = RANDOM_S();
+         }
+
+         TEST_CALL(srslte_vec_prod_sss(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] * y[i];
+           mse += cabsf(gold - z[i]);
+         }
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+TEST(srslte_vec_acc_cc,
+     MALLOC(cf_t, x);
+         cf_t z;
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+         }
+
+         TEST_CALL(z = srslte_vec_acc_cc(x, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold += x[i];
+         }
+
+         mse += cabsf(gold - z)/cabsf(gold);
+
+         free(x);
+)
+
+
+TEST(srslte_vec_sum_fff,
+  MALLOC(float, x);
+  MALLOC(float, y);
+  MALLOC(float, z);
+
+  cf_t gold = 0.0f;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_sum_fff(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar reference sum */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] + y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  free(x);
+  free(y);
+  free(z); /* output buffer was previously leaked */
+)
+
+TEST(srslte_vec_sub_fff,
+  MALLOC(float, x);
+  MALLOC(float, y);
+  MALLOC(float, z);
+
+  cf_t gold = 0.0f;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_sub_fff(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar reference difference */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] - y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  free(x);
+  free(y);
+  free(z); /* output buffer was previously leaked */
+)
+
+TEST(srslte_vec_dot_prod_ccc,
+     MALLOC(cf_t, x);
+         MALLOC(cf_t, y);
+         cf_t z;
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_CF();
+         }
+
+         TEST_CALL(z = srslte_vec_dot_prod_ccc(x, y, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold += x[i] * y[i];
+         }
+
+         mse = cabsf(gold - z) / cabsf(gold);
+
+         free(x);
+         free(y);
+)
+
+TEST(srslte_vec_dot_prod_conj_ccc,
+     MALLOC(cf_t, x);
+         MALLOC(cf_t, y);
+         cf_t z;
+
+         cf_t gold = 0.0f;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_CF();
+         }
+
+         TEST_CALL(z = srslte_vec_dot_prod_conj_ccc(x, y, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold += x[i] * conjf(y[i]);
+         }
+
+         mse = cabsf(gold - z) / cabsf(gold);
+
+         free(x);
+         free(y);
+)
+
+TEST(srslte_vec_prod_ccc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_prod_ccc(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar reference product */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  free(x);
+  free(y); /* second operand was previously leaked */
+  free(z);
+)
+
+TEST(srslte_vec_prod_conj_ccc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_prod_conj_ccc(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar x * conj(y) reference */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * conjf(y[i]);
+    mse += cabsf(gold - z[i]);
+  }
+  free(x);
+  free(y); /* second operand was previously leaked */
+  free(z);
+)
+
+TEST(srslte_vec_sc_prod_ccc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, z);
+  cf_t y = RANDOM_F();
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_sc_prod_ccc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y;
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
+TEST(srslte_vec_prod_fff,
+  MALLOC(float, x);
+  MALLOC(float, y);
+  MALLOC(float, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F(); /* real-valued buffers: use the float generator */
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_fff(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar reference product */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  free(x);
+  free(y); /* second operand was previously leaked */
+  free(z);
+)
+
+TEST(srslte_vec_sc_prod_fff,
+  MALLOC(float, x);
+  MALLOC(float, z);
+  float y = RANDOM_F();
+
+  float gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F(); /* real-valued buffer: use the float generator */
+  }
+
+  TEST_CALL(srslte_vec_sc_prod_fff(x, y, z, block_size))
+  /* Accumulate the absolute error against a scalar reference product */
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y;
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
+TEST(srslte_vec_abs_cf,
+  MALLOC(cf_t, x);
+  MALLOC(float, z);
+  float gold;
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_abs_cf(x, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = sqrtf(crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i]));
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
+TEST(srslte_vec_abs_square_cf,
+  MALLOC(cf_t, x);
+  MALLOC(float, z);
+  float gold;
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_abs_square_cf(x, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i]);
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
+TEST(srslte_vec_sc_prod_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, z);
+  cf_t gold;
+  float h = RANDOM_F();
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_sc_prod_cfc(x, h, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * h;
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
+int main(int argc, char **argv) {
+  char func_names[MAX_FUNCTIONS][32];
+  double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
+  uint32_t sizes[32];
+  uint32_t size_count = 0;
+  uint32_t func_count = 0;
+  bool passed = true;
+
+  for (uint32_t block_size = 1; block_size <= 1024*16; block_size *= 2) {
+    func_count = 0;
+
+    passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    sizes[size_count] = block_size;
+    size_count++;
+  }
+
+  printf("\n");
+  printf("%32s |", "Subroutine/MSps");
+  for (int i = 0; i < size_count; i++) {
+    printf(" %7d", sizes[i]);
+  }
+  printf("  |\n");
+
+  for (int j = 0; j < 32; j++) {
+    printf("-");
+  }
+  printf("-+-");
+  for (int j = 0; j < size_count; j++) {
+    printf("--------");
+  }
+  printf("-|\n");
+
+  for (int i = 0; i < func_count; i++) {
+    printf("%32s | ", func_names[i]);
+    for (int j = 0; j < size_count; j++) {
+      printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+    }
+    printf(" |\n");
+  }
+
+  return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
+}
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index 917810e92..cb21f24f1 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -36,25 +36,6 @@
 #include "srslte/phy/utils/bit.h"
 
 
-#ifdef LV_HAVE_SSE
-#include <smmintrin.h>
-#endif
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#endif
-
-
-#ifdef HAVE_VOLK
-#include "volk/volk.h"
-#endif
-
-#ifdef DEBUG_MODE
-#warning FIXME: Disabling SSE/AVX vector code
-#undef LV_HAVE_SSE
-#undef LV_HAVE_AVX
-#endif
-
 
 int srslte_vec_acc_ii(int *x, uint32_t len) {
   int i;
@@ -88,51 +69,25 @@ void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float co
 }
 
 cf_t srslte_vec_acc_cc(cf_t *x, uint32_t len) {
-  int i;
-  cf_t z=0;
-  for (i=0;i<len;i++) {
-    z+=x[i];
-  }
-  return z;
+  return srslte_vec_acc_cc_simd(x, len);
 }
 
-void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints) {
+#warning Remove function if not used!
+/*void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints) {
   uint32_t i;
   cf_t diff; 
   for (i=0;i<npoints;i++) {
     diff = symbol - points[i];
     distance[i] = crealf(diff) * crealf(diff) + cimagf(diff) * cimagf(diff);
   }
-}
+}*/
 
 void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]-y[i];
-  }
-#else
-#ifdef LV_HAVE_AVX
-  srslte_vec_sub_fff_avx(x, y, z, len);
-#else
-  srslte_vec_sub_fff_sse(x, y, z, len);
-#endif
-#endif
+  srslte_vec_sub_fff_simd(x, y, z, len);
 }
 
-void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX2
-  srslte_vec_sub_sss_avx2(x, y, z, len);
-#else
-#ifdef LV_HAVE_SSE
-  srslte_vec_sub_sss_sse(x, y, z, len);
-#else
-    int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]-y[i];
-  }
-#endif
-#endif
+void srslte_vec_sub_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len) {
+  srslte_vec_sub_sss_simd(x, y, z, len);
 }
 
 // Noise estimation in chest_dl, interpolation 
@@ -142,33 +97,11 @@ void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
 
 // Used in PSS/SSS and sum_ccc
 void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+y[i];
-  }
-#else
-  #ifdef LV_HAVE_AVX
-    srslte_vec_sum_fff_avx(x, y, z, len);
-  #else
-    srslte_vec_sum_fff_sse(x, y, z, len);
-  #endif
-#endif
+  srslte_vec_add_fff_simd(x, y, z, len);
 }
 
-void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX2
-  srslte_vec_sum_sss_avx2(x, y, z, len);
-#else
-#ifdef LV_HAVE_SSE
-  srslte_vec_sum_sss_sse(x, y, z, len);
-#else
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+y[i];
-  }
-#endif
-#endif
+void srslte_vec_sum_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len) {
+  srslte_vec_sum_sss_simd(x, y, z, len);
 }
 
 void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
@@ -197,7 +130,7 @@ void srslte_vec_sc_add_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
 }
 
 void srslte_vec_sc_add_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-  int i; 
+  int i;
   for (i=0;i<len;i++) {
     z[i] = x[i]+ h;
   }
@@ -211,14 +144,7 @@ void srslte_vec_sc_add_sss(int16_t *x, int16_t h, int16_t *z, uint32_t len) {
 }
 // PSS, PBCH, DEMOD, FFTW, etc. 
 void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-#else
-  srslte_vec_sc_prod_fff_sse(x, h, z, len);
-#endif
+  srslte_vec_sc_prod_fff_simd(x, h, z, len);
 }
 
 void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
@@ -228,7 +154,8 @@ void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
   }
 }
 
-void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
+#warning remove function if it is not used
+/*void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
 #ifdef LV_HAVE_AVX2
   srslte_vec_sc_div2_sss_avx2(x, n_rightshift, z, len);
 #else
@@ -242,7 +169,7 @@ void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len)
   }
 #endif
 #endif
-}
+}*/
 
 // TODO: Improve this implementation
 void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
@@ -257,14 +184,7 @@ void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
 
 // Used throughout 
 void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) { 
-#ifdef LV_HAVE_AVX
-  srslte_vec_sc_prod_cfc_avx(x,h,z,len);
-#else
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-#endif
+  srslte_vec_sc_prod_cfc_simd(x,h,z,len);
 }
 
 
@@ -276,7 +196,7 @@ void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
     z[i] = x[i]*h;
   }
 #else
-  srslte_vec_sc_prod_ccc_sse(x,h,z,len);
+  srslte_vec_sc_prod_ccc_simd(x,h,z,len);
 #endif
 }
 
@@ -360,7 +280,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
-  if (posix_memalign(&ptr,256,size)) {
+  if (posix_memalign(&ptr,512,size)) {
     return NULL;
   } else {
     return ptr;
@@ -511,50 +431,22 @@ void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
 
 // Used in scrambling float
 void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
+  srslte_vec_prod_fff_simd(x, y, z, len);
 }
 
 // Scrambling Short
-void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX2
-  srslte_vec_prod_sss_avx2(x,y,z,len);
-#else
-#ifdef LV_HAVE_SSE
-  srslte_vec_prod_sss_sse(x,y,z,len);
-#else
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
-#endif
-#endif
+void srslte_vec_prod_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len) {
+  srslte_vec_prod_sss_simd(x,y,z,len);
 }
 
 // CFO and OFDM processing
 void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
-#else
-  srslte_vec_prod_ccc_sse(x,y,z,len);
-#endif
+  srslte_vec_prod_ccc_simd(x,y,z,len);
 }
 
 // PRACH, CHEST UL, etc. 
 void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*conjf(y[i]);
-  }
-#else
-  srslte_vec_prod_conj_ccc_sse(x,y,z,len);
-#endif
+  srslte_vec_prod_conj_ccc_simd(x,y,z,len);
 }
 
 //#define DIV_USE_VEC
@@ -598,16 +490,7 @@ void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
 
 // PSS. convolution 
 cf_t srslte_vec_dot_prod_ccc(cf_t *x, cf_t *y, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  uint32_t i;
-  cf_t res = 0;
-  for (i=0;i<len;i++) {
-    res += x[i]*y[i];
-  }
-  return res;
-#else
-  return srslte_vec_dot_prod_ccc_sse(x, y, len);
-#endif
+  return srslte_vec_dot_prod_ccc_simd(x, y, len);
 }
 
 // Convolution filter and in SSS search 
@@ -622,17 +505,7 @@ cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len) {
 
 // SYNC 
 cf_t srslte_vec_dot_prod_conj_ccc(cf_t *x, cf_t *y, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  uint32_t i;
-  cf_t res = 0;
-  for (i=0;i<len;i++) {
-    res += x[i]*conjf(y[i]);
-  }
-  return res;
-#else
-  return srslte_vec_dot_prod_conj_ccc_sse(x, y, len);
-#endif
-  
+  return srslte_vec_dot_prod_conj_ccc_simd(x, y, len);
 }
 
 // PHICH 
@@ -646,20 +519,7 @@ float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len) {
 }
 
 int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) {
-#ifdef LV_HAVE_AVX2
-  return srslte_vec_dot_prod_sss_avx2(x, y, len);
-#else
-#ifdef LV_HAVE_SSE
-  return srslte_vec_dot_prod_sss_sse(x, y, len);
-#else
-  uint32_t i;
-  int32_t res = 0;
-  for (i=0;i<len;i++) {
-    res += x[i]*y[i];
-  }
-  return res;
-#endif
-#endif
+  return srslte_vec_dot_prod_sss_simd(x, y, len);
 }
 
 float srslte_vec_avg_power_cf(cf_t *x, uint32_t len) {
@@ -672,27 +532,17 @@ float srslte_vec_corr_ccc(cf_t *x, cf_t *y, uint32_t len) {
   float s_x = crealf(srslte_vec_dot_prod_conj_ccc(x, x, len))/len;
   float s_y = crealf(srslte_vec_dot_prod_conj_ccc(y, y, len))/len;
   float cov = crealf(srslte_vec_dot_prod_conj_ccc(x, y, len))/len;
-  return cov/(sqrt(s_x*s_y));
+  return cov/(sqrtf(s_x*s_y));
 }
 
 // PSS (disabled and using abs_square )
 void srslte_vec_abs_cf(cf_t *x, float *abs, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    abs[i] = cabsf(x[i]);
-  }
+  srslte_vec_abs_cf_simd(x, abs, len);
 }
 
 // PRACH 
 void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    abs_square[i] = crealf(x[i])*crealf(x[i])+cimagf(x[i])*cimagf(x[i]);
-  }
-#else
-  srslte_vec_abs_square_cf_sse(x,abs_square,len);
-#endif
+  srslte_vec_abs_square_cf_simd(x,abs_square,len);
 }
 
 
@@ -821,23 +671,5 @@ void srslte_vec_quant_suc(int16_t *in, uint8_t *out, float gain, int16_t offset,
 }
 
 void srs_vec_cf_cpy(cf_t *dst, cf_t *src, int len) {
-  int i = 0;
-
-#ifdef LV_HAVE_AVX
-    for (; i < len - 3; i += 4) {
-      _mm256_store_ps((float *) &dst[i], _mm256_load_ps((float *) &src[i]));
-    }
-#endif /* LV_HAVE_AVX */
-#ifdef LV_HAVE_SSE
-    for (; i < len - 1; i += 2) {
-      _mm_store_ps((float *) &dst[i], _mm_load_ps((float *) &src[i]));
-    }
-  for (; i < len; i++) {
-    ((__m64*) dst)[i] = ((__m64*) src)[i];
-  }
-#else
-  for (; i < len; i++) {
-    dst[i] = src[i];
-  }
-#endif /* LV_HAVE_SSE */
+  srslte_vec_cp_simd(dst, src, len); /* NOTE(review): srslte_vec_cp_simd is declared as (src, dst, len) -- argument order looks reversed here; confirm */
 }
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index d38373d80..21132390f 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -25,310 +25,148 @@
  */
 
 
-#include <float.h>
 #include <complex.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-
-#include "srslte/phy/utils/vector_simd.h"
-
 #include <inttypes.h>
 #include <stdio.h>
 
-#ifdef LV_HAVE_SSE
-#include <smmintrin.h>
-#endif
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#endif
+#include <srslte/config.h>
+#include "srslte/phy/utils/vector_simd.h"
+#include "srslte/phy/utils/simd.h"
 
 
-int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len)
-{
-  int result = 0; 
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
+int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len) {
+  int i = 0;
+  int result = 0;
+#if SRSLTE_SIMD_S_SIZE
+  simd_s_t simd_dotProdVal = srslte_simd_s_zero();
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_load(&x[i]);
+      simd_s_t b = srslte_simd_s_load(&y[i]);
 
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* yPtr = (const __m128i*) y;
-  
-  __m128i dotProdVal = _mm_setzero_si128();
+      simd_s_t z = srslte_simd_s_mul(a, b);
 
-  __m128i xVal, yVal, zVal;
-  for(;number < points; number++){
+      simd_dotProdVal = srslte_simd_s_add(simd_dotProdVal, z);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_loadu(&x[i]);
+      simd_s_t b = srslte_simd_s_loadu(&y[i]);
 
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_loadu_si128(yPtr);
+      simd_s_t z = srslte_simd_s_mul(a, b);
 
-    zVal = _mm_mullo_epi16(xVal, yVal);
-
-    dotProdVal = _mm_add_epi16(dotProdVal, zVal);
-
-    xPtr ++;
-    yPtr ++;
+      simd_dotProdVal = srslte_simd_s_add(simd_dotProdVal, z);
+    }
   }
-  
-  short dotProdVector[8];
-  _mm_store_si128((__m128i*) dotProdVector, dotProdVal);
-  for (int i=0;i<8;i++) {
-    result += dotProdVector[i]; 
+  __attribute__ ((aligned (SRSLTE_SIMD_S_SIZE*2))) short dotProdVector[SRSLTE_SIMD_S_SIZE];
+  srslte_simd_s_store(dotProdVector, simd_dotProdVal);
+  for (int k = 0; k < SRSLTE_SIMD_S_SIZE; k++) {
+    result += dotProdVector[k];
+  }
+#endif /* SRSLTE_SIMD_S_SIZE */
+
+  for(; i < len; i++){
+    result += (x[i] * y[i]);
   }
 
-  number = points * 8;
-  for(;number < len; number++){
-    result += (x[number] * y[number]);
-  }
-  
-#endif
   return result; 
 }
 
+void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
+  int i = 0;
+#if SRSLTE_SIMD_S_SIZE /* value test, as in srslte_vec_dot_prod_sss_simd: a size defined as 0 must skip the SIMD path */
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_load(&x[i]);
+      simd_s_t b = srslte_simd_s_load(&y[i]);
 
-int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len)
-{
-  int result = 0; 
-#ifdef LV_HAVE_AVX2
-  unsigned int number = 0;
-  const unsigned int points = len / 16;
+      simd_s_t r = srslte_simd_s_add(a, b);
 
-  const __m256i* xPtr = (const __m256i*) x;
-  const __m256i* yPtr = (const __m256i*) y;
-  
-  __m256i dotProdVal = _mm256_setzero_si256();
+      srslte_simd_s_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_loadu(&x[i]);
+      simd_s_t b = srslte_simd_s_loadu(&y[i]);
 
-  __m256i xVal, yVal, zVal;
-  for(;number < points; number++){
+      simd_s_t r = srslte_simd_s_add(a, b);
 
-    xVal = _mm256_load_si256(xPtr);
-    yVal = _mm256_loadu_si256(yPtr);
-    zVal = _mm256_mullo_epi16(xVal, yVal);
-    dotProdVal = _mm256_add_epi16(dotProdVal, zVal);
-    xPtr ++;
-    yPtr ++;
-  }
-  
-  __attribute__ ((aligned (256))) short dotProdVector[16];
-  _mm256_store_si256((__m256i*) dotProdVector, dotProdVal);
-  for (int i=0;i<16;i++) {
-    result += dotProdVector[i]; 
+      srslte_simd_s_storeu(&z[i], r);
+    }
   }
+#endif /* SRSLTE_SIMD_S_SIZE */
 
-  number = points * 16;
-  for(;number < len; number++){
-    result += (x[number] * y[number]);
+  for(; i < len; i++){
+    z[i] = x[i] + y[i];
   }
-  
-#endif
-  return result; 
 }
 
+void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
+  int i = 0;
+#if SRSLTE_SIMD_S_SIZE /* value test, as in srslte_vec_dot_prod_sss_simd: a size defined as 0 must skip the SIMD path */
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_load(&x[i]);
+      simd_s_t b = srslte_simd_s_load(&y[i]);
 
+      simd_s_t r = srslte_simd_s_sub(a, b);
 
-void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
+      srslte_simd_s_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_loadu(&x[i]);
+      simd_s_t b = srslte_simd_s_loadu(&y[i]);
 
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* yPtr = (const __m128i*) y;
-  __m128i* zPtr = (__m128i*) z;
+      simd_s_t r = srslte_simd_s_sub(a, b);
 
-  __m128i xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
-
-    zVal = _mm_add_epi16(xVal, yVal);
-
-    _mm_store_si128(zPtr, zVal); 
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
+      srslte_simd_s_storeu(&z[i], r);
+    }
   }
+#endif /* SRSLTE_SIMD_S_SIZE */
 
-  number = points * 8;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
+  for(; i < len; i++){
+    z[i] = x[i] - y[i];
   }
-#endif
-
 }
 
-void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_AVX2
-  unsigned int number = 0;
-  const unsigned int points = len / 16;
+void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
+  int i = 0;
+#if SRSLTE_SIMD_S_SIZE /* value test, as in srslte_vec_dot_prod_sss_simd: a size defined as 0 must skip the SIMD path */
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_load(&x[i]);
+      simd_s_t b = srslte_simd_s_load(&y[i]);
 
-  const __m256i* xPtr = (const __m256i*) x;
-  const __m256i* yPtr = (const __m256i*) y;
-  __m256i* zPtr = (__m256i*) z;
+      simd_s_t r = srslte_simd_s_mul(a, b);
 
-  __m256i xVal, yVal, zVal;
-  for(;number < points; number++){
+      srslte_simd_s_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_s_t a = srslte_simd_s_loadu(&x[i]);
+      simd_s_t b = srslte_simd_s_loadu(&y[i]);
 
-    xVal = _mm256_load_si256(xPtr);
-    yVal = _mm256_loadu_si256(yPtr);
+      simd_s_t r = srslte_simd_s_mul(a, b);
 
-    zVal = _mm256_add_epi16(xVal, yVal);
-    _mm256_store_si256(zPtr, zVal); 
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
+      srslte_simd_s_storeu(&z[i], r);
+    }
   }
+#endif /* SRSLTE_SIMD_S_SIZE */
 
-  number = points * 16;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
+  for(; i < len; i++){
+    z[i] = x[i] * y[i];
   }
-#endif
-
-}
-
-
-void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* yPtr = (const __m128i*) y;
-  __m128i* zPtr = (__m128i*) z;
-
-  __m128i xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
-
-    zVal = _mm_sub_epi16(xVal, yVal);
-
-    _mm_store_si128(zPtr, zVal);
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 8;
-  for(;number < len; number++){
-    z[number] = x[number] - y[number];
-  }
-#endif
-}
-
-void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_AVX2
-  unsigned int number = 0;
-  const unsigned int points = len / 16;
-
-  const __m256i* xPtr = (const __m256i*) x;
-  const __m256i* yPtr = (const __m256i*) y;
-  __m256i* zPtr = (__m256i*) z;
-
-  __m256i xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_load_si256(xPtr);
-    yVal = _mm256_loadu_si256(yPtr);
-
-    zVal = _mm256_sub_epi16(xVal, yVal);
-
-    _mm256_store_si256(zPtr, zVal); 
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 16;
-  for(;number < len; number++){
-    z[number] = x[number] - y[number];
-  }
-  #endif
 }
 
 
 
 
-void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* yPtr = (const __m128i*) y;
-  __m128i* zPtr = (__m128i*) z;
-
-  __m128i xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm_load_si128(xPtr);
-    yVal = _mm_load_si128(yPtr);
-
-    zVal = _mm_mullo_epi16(xVal, yVal);
-
-    _mm_store_si128(zPtr, zVal);
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 8;
-  for(;number < len; number++){
-    z[number] = x[number] * y[number];
-  }
-#endif
-}
-
-void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_AVX2
-  unsigned int number = 0;
-  const unsigned int points = len / 16;
-
-  const __m256i* xPtr = (const __m256i*) x;
-  const __m256i* yPtr = (const __m256i*) y;
-  __m256i* zPtr = (__m256i*) z;
-
-  __m256i xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_loadu_si256(xPtr);
-    yVal = _mm256_loadu_si256(yPtr);
-
-    zVal = _mm256_mullo_epi16(xVal, yVal);
-
-    _mm256_storeu_si256(zPtr, zVal); 
-
-    xPtr ++;
-    yPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 16;
-  for(;number < len; number++){
-    z[number] = x[number] * y[number];
-  }
-#endif
-}
-
-
-
-
-
-
+#warning remove function if it is not used
+/*
 void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
@@ -357,8 +195,10 @@ void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
     z[number] = x[number] / divn;
   }
 #endif
-}
+}*/
 
+#warning remove function if it is not used
+/*
 void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_AVX2
@@ -387,7 +227,7 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
     z[number] = x[number] / divn;
   }
 #endif
-}
+}*/
 
 
 
@@ -531,379 +371,527 @@ void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
 #endif
 }
 
-void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 4;
+cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
+  int i = 0;
+  cf_t acc_sum = 0.0f;
 
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
+#if SRSLTE_SIMD_F_SIZE
+  simd_f_t simd_sum = srslte_simd_f_zero();
 
-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t a = srslte_simd_f_load((float *) &x[i]);
 
-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t a = srslte_simd_f_loadu((float *) &x[i]);
 
-    zVal = _mm_sub_ps(xVal, yVal);
-
-    _mm_storeu_ps(zPtr, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
   }
 
-  for(number = points * 4;number < len; number++){
-    z[number] = x[number] - y[number];
+  __attribute__((aligned(64))) cf_t sum[SRSLTE_SIMD_F_SIZE/2];
+  srslte_simd_f_store((float*)&sum, simd_sum);
+  for (int k = 0; k < SRSLTE_SIMD_F_SIZE/2; k++) {
+    acc_sum += sum[k];
   }
 #endif
+
+  for (; i<len; i++) {
+    acc_sum += x[i];
+  }
+  return acc_sum;
 }
 
+void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
 
-void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
 
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
+      simd_f_t r = srslte_simd_f_add(a, b);
 
-  __m256 xVal, yVal, zVal;
-  for(;number < points; number++){
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
 
-    xVal = _mm256_loadu_ps(xPtr);
-    yVal = _mm256_loadu_ps(yPtr);
+      simd_f_t r = srslte_simd_f_add(a, b);
 
-    zVal = _mm256_sub_ps(xVal, yVal);
-
-    _mm256_storeu_ps(zPtr, zVal);
-
-    xPtr += 8;
-    yPtr += 8;
-    zPtr += 8;
-  }
-
-  for(number = points * 8;number < len; number++){
-    z[number] = x[number] - y[number];
+      srslte_simd_f_storeu(&z[i], r);
+    }
   }
 #endif
+
+  for (; i<len; i++) {
+    z[i] = x[i] + y[i];
+  }
 }
 
-#ifdef LV_HAVE_SSE
-static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
-  __m128 yl, yh, tmp1, tmp2;
-  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-  tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-  x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-  tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-  return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-}
+void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+
+      simd_f_t r = srslte_simd_f_sub(a, b);
+
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+
+      simd_f_t r = srslte_simd_f_sub(a, b);
+
+      srslte_simd_f_storeu(&z[i], r);
+    }
+  }
 #endif
 
-
-#ifdef LV_HAVE_SSE
-static inline __m128 _mm_complexmulconj_ps(__m128 x, __m128 y) {
-  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-  y = _mm_xor_ps(y, conjugator); 
-  return _mm_complexmul_ps(x, y);
+  for (; i < len; i++) {
+    z[i] = x[i] - y[i];
+  }
 }
+
+cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len) {
+  int i = 0;
+  cf_t result = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  __attribute__((aligned(64))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
+
+  simd_cf_t avx_result = srslte_simd_cf_zero();
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
+
+      avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
+      srslte_simd_cfi_store(simd_dotProdVector, avx_result);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
+
+      avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
+      srslte_simd_cfi_storeu(simd_dotProdVector, avx_result);
+    }
+  }
+
+  srslte_simd_cfi_store(simd_dotProdVector, avx_result);
+  for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
+    result += simd_dotProdVector[k];
+  }
 #endif
 
-cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len)
+  for (; i < len; i++) {
+    result += (x[i] * y[i]);
+  }
+
+  return result;
+}
+
+c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len) {
+  int i = 0;
+  c16_t result = 0;
+
+#if SRSLTE_SIMD_C16_SIZE
+  simd_c16_t avx_result = srslte_simd_c16_zero();
+
+  for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+    simd_c16_t xVal = srslte_simd_c16i_load(&x[i]);
+    simd_c16_t yVal = srslte_simd_c16i_load(&y[i]);
+
+    avx_result = srslte_simd_c16_add(srslte_simd_c16_prod(xVal, yVal), avx_result);
+  }
+
+  __attribute__((aligned(256))) c16_t avx_dotProdVector[16] = {0};
+  srslte_simd_c16i_store(avx_dotProdVector, avx_result);
+  for (int k = 0; k < 16; k++) {
+    result += avx_dotProdVector[k];
+  }
+#endif
+
+  for(;i < len; i++){
+    result += (x[i] * y[i])/(1<<14);
+  }
+
+  return result;
+}
+
+cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len)
 {
-  cf_t result = 0; 
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 2;
+  int i = 0;
+  cf_t result = 0;
 
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  
-  __m128 dotProdVal = _mm_setzero_ps();
+#if SRSLTE_SIMD_CF_SIZE
+  __attribute__((aligned(256))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
 
-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
+  simd_cf_t simd_result = srslte_simd_cf_zero();
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
 
-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
+      simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
 
-    zVal = _mm_complexmul_ps(xVal, yVal);
-
-    dotProdVal = _mm_add_ps(dotProdVal, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
-  }
-  
-  cf_t dotProdVector[2];
-  _mm_storeu_ps((float*) dotProdVector, dotProdVal);
-  for (int i=0;i<2;i++) {
-    result += dotProdVector[i]; 
+      simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result);
+    }
   }
 
-  number = points * 2;
-  for(;number < len; number++){
-    result += (x[number] * y[number]);
+  srslte_simd_cfi_store(simd_dotProdVector, simd_result);
+  for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
+    result += simd_dotProdVector[k];
   }
-  
 #endif
-  return result; 
+
+  for (; i < len; i++) {
+    result += x[i] * conjf(y[i]);
+  }
+
+  return result;
 }
 
-cf_t srslte_vec_dot_prod_conj_ccc_sse(cf_t *x, cf_t *y, uint32_t len)
-{
-  cf_t result = 0; 
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 2;
+void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
 
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  
-  __m128 dotProdVal = _mm_setzero_ps();
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
 
-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
+      simd_f_t r = srslte_simd_f_mul(a, b);
 
-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
 
-    zVal = _mm_complexmulconj_ps(xVal, yVal);
+      simd_f_t r = srslte_simd_f_mul(a, b);
 
-    dotProdVal = _mm_add_ps(dotProdVal, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
+      srslte_simd_f_storeu(&z[i], r);
+    }
   }
-  
-  cf_t dotProdVector[2];
-  _mm_storeu_ps((float*) dotProdVector, dotProdVal);
-  for (int i=0;i<2;i++) {
-    result += dotProdVector[i]; 
-  }
-
-  number = points * 2;
-  for(;number < len; number++){
-    result += (x[number] * y[number]);
-  }
-  
 #endif
-  return result; 
+
+  for (; i<len; i++) {
+    z[i] = x[i] * y[i];
+  }
 }
 
-void srslte_vec_prod_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int halfPoints = len / 2;
+void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
+  int i = 0;
 
-  __m128 xVal, yVal, zVal;
-  float* zPtr = (float*) z;
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_load(&y[i]);
 
-  for(; number < halfPoints; number++){
-    xVal = _mm_loadu_ps(xPtr); 
-    yVal = _mm_loadu_ps(yPtr); 
-    zVal = _mm_complexmul_ps(xVal, yVal);
-    _mm_storeu_ps(zPtr, zVal); 
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
 
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
-  }
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
 
-  number = halfPoints * 2;
-  for(;number < len; number++){
-    z[number] = x[number] * y[number];
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
   }
 #endif
+
+  for (; i<len; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+    simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
+    simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
+
+    simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+    srslte_simd_cf_store(&r_re[i], &r_im[i], r);
+  }
+#endif
+
+  for (; i<len; i++) {
+    r_re[i] = a_re[i]*b_re[i] - a_im[i]*b_im[i];
+    r_im[i] = a_re[i]*b_im[i] + a_im[i]*b_re[i];
+  }
+}
+
+void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                  int16_t *r_im, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_C16_SIZE
+  for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+    simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
+    simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
+
+    simd_c16_t r = srslte_simd_c16_prod(a, b);
+
+    srslte_simd_c16_store(&r_re[i], &r_im[i], r);
+  }
+#endif
+
+  for (; i<len; i++) {
+    r_re[i] = a_re[i]*b_re[i] - a_im[i]*b_im[i];
+    r_im[i] = a_re[i]*b_im[i] + a_im[i]*b_re[i];
+  }
+}
+
+void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_load(&y[i]);
+
+      simd_cf_t r = srslte_simd_cf_conjprod(a, b);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
+
+      simd_cf_t r = srslte_simd_cf_conjprod(a, b);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i<len; i++) {
+    z[i] = x[i] * conjf(y[i]);
+  }
+}
+
+void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  const simd_f_t hre = srslte_simd_f_set1(__real__ h);
+  const simd_f_t him = srslte_simd_f_set1(__imag__ h);
+
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_load((float *) &x[i]);
+
+      simd_f_t m1 = srslte_simd_f_mul(hre, temp);
+      simd_f_t sw = srslte_simd_f_swap(temp);
+      simd_f_t m2 = srslte_simd_f_mul(him, sw);
+      simd_f_t r = srslte_simd_f_addsub(m1, m2);
+
+      srslte_simd_f_store((float *) &z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_load((float *) &x[i]);
+
+      simd_f_t m1 = srslte_simd_f_mul(hre, temp);
+      simd_f_t sw = srslte_simd_f_swap(temp);
+      simd_f_t m2 = srslte_simd_f_mul(him, sw);
+      simd_f_t r = srslte_simd_f_addsub(m1, m2);
+
+      srslte_simd_f_store((float *) &z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] * h;
+  }
+}
+
+void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  const simd_f_t hh = srslte_simd_f_set1(h);
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t xx = srslte_simd_f_load(&x[i]);
+
+      simd_f_t zz = srslte_simd_f_mul(xx, hh);
+
+      srslte_simd_f_store(&z[i], zz);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t xx = srslte_simd_f_loadu(&x[i]);
+
+      simd_f_t zz = srslte_simd_f_mul(xx, hh);
+
+      srslte_simd_f_storeu(&z[i], zz);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] * h;
+  }
+}
+
+void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+      z1 = srslte_simd_f_sqrt(z1);
+
+      srslte_simd_f_store(&z[i], z1);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+      z1 = srslte_simd_f_sqrt(z1);
+
+      srslte_simd_f_storeu(&z[i], z1);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = sqrtf(__real__(x[i]) * __real__(x[i]) + __imag__(x[i]) * __imag__(x[i]));
+  }
+}
+
+void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      srslte_simd_f_store(&z[i], z1);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      srslte_simd_f_storeu(&z[i], z1);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = __real__(x[i]) * __real__(x[i]) + __imag__(x[i]) * __imag__(x[i]);
+  }
 }
 
 
-void srslte_vec_prod_conj_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int halfPoints = len / 2;
+void srslte_vec_sc_prod_cfc_simd(const cf_t *x, const float h, cf_t *z, const int len) {
+  int i = 0;
 
-  __m128 xVal, yVal, zVal;
-  float* zPtr = (float*) z;
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
+#if SRSLTE_SIMD_F_SIZE
+  const simd_f_t tap = srslte_simd_f_set1(h);
 
-  for(; number < halfPoints; number++){
-    xVal = _mm_loadu_ps(xPtr); 
-    yVal = _mm_loadu_ps(yPtr); 
-    zVal = _mm_complexmulconj_ps(xVal, yVal);
-    _mm_storeu_ps(zPtr, zVal); 
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_load((float *) &x[i]);
 
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
-  }
+      temp = srslte_simd_f_mul(tap, temp);
 
-  number = halfPoints * 2;
-  for(;number < len; number++){
-    z[number] = x[number] * conjf(y[number]);
+      srslte_simd_f_store((float *) &z[i], temp);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_loadu((float *) &x[i]);
+
+      temp = srslte_simd_f_mul(tap, temp);
+
+      srslte_simd_f_storeu((float *) &z[i], temp);
+    }
   }
 #endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] * h;
+  }
 }
 
-void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int halfPoints = len / 2;
+void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
+  uint32_t i = 0;
 
-  __m128 xVal, yl, yh, zVal, tmp1, tmp2;
-  float* zPtr = (float*) z;
-  const float* xPtr = (const float*) x;
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(src) && SRSLTE_IS_ALIGNED(dst)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_load((float *) &src[i]);
 
-  // Set up constant scalar vector
-  yl = _mm_set_ps1(creal(h));
-  yh = _mm_set_ps1(cimag(h));
+      srslte_simd_f_store((float *) &dst[i], temp);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
+      simd_f_t temp = srslte_simd_f_loadu((float *) &src[i]);
 
-  for(;number < halfPoints; number++){
-
-    xVal = _mm_loadu_ps(xPtr); 
-    tmp1 = _mm_mul_ps(xVal,yl); 
-    xVal = _mm_shuffle_ps(xVal,xVal,0xB1); 
-    tmp2 = _mm_mul_ps(xVal,yh); 
-    zVal = _mm_addsub_ps(tmp1,tmp2); 
-    _mm_storeu_ps(zPtr,zVal); 
-
-    xPtr += 4;
-    zPtr += 4;
-  }
-
-  number = halfPoints * 2;
-  for(;number < len; number++){
-    z[number] = x[number] * h;
-  }
-#endif
-}
-
-
-void srslte_vec_sc_prod_cfc_sse(cf_t *x, float h, cf_t *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int halfPoints = len / 2;
-
-  __m128 xVal, hVal, zVal;
-  float* zPtr = (float*) z;
-  const float* xPtr = (const float*) x;
-
-  // Set up constant scalar vector
-  hVal = _mm_set_ps1(h);
-  
-  for(;number < halfPoints; number++){
-
-    xVal = _mm_loadu_ps(xPtr); 
-    zVal = _mm_mul_ps(xVal,hVal); 
-    _mm_storeu_ps(zPtr,zVal); 
-
-    xPtr += 4;
-    zPtr += 4;
-  }
-
-  number = halfPoints * 2;
-  for(;number < len; number++){
-    z[number] = x[number] * h;
-  }
-
-#endif
-}
-
-
-
-void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int quarterPoints = len / 4;
-
-  __m128 xVal, hVal, zVal;
-  float* zPtr = (float*) z;
-  const float* xPtr = (const float*) x;
-
-  // Set up constant scalar vector
-  hVal = _mm_set_ps1(h);
-  
-  for(;number < quarterPoints; number++){
-
-    xVal = _mm_loadu_ps(xPtr); 
-    zVal = _mm_mul_ps(xVal,hVal); 
-    _mm_storeu_ps(zPtr,zVal); 
-
-    xPtr += 4;
-    zPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < len; number++){
-    z[number] = x[number] * h;
-  }
-
-#endif
-}
-
-void srslte_vec_abs_square_cf_sse(cf_t *x, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int quarterPoints = len / 4;
-
-  const float* xPtr = (const float*) x;
-  float* zPtr = z;
-
-  __m128 xVal1, xVal2, zVal;
-  for(; number < quarterPoints; number++){
-    xVal1 = _mm_loadu_ps(xPtr);
-    xPtr += 4;
-    xVal2 = _mm_loadu_ps(xPtr);
-    xPtr += 4;
-    xVal1 = _mm_mul_ps(xVal1, xVal1); 
-    xVal2 = _mm_mul_ps(xVal2, xVal2); 
-    zVal = _mm_hadd_ps(xVal1, xVal2);
-    _mm_storeu_ps(zPtr, zVal);
-    zPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(;number < len; number++){
-    z[number] = creal(x[number]) * creal(x[number]) + cimag(x[number])*cimag(x[number]);
-  }
-#endif
-}
-
-
-//srslte_32fc_s32f_multiply_32fc_avx
- void srslte_vec_sc_prod_cfc_avx( const cf_t *x,const float h,cf_t *z,const uint32_t len)
-{
-#ifdef LV_HAVE_AVX
-   
-  unsigned int i = 0;
-  const unsigned int loops = len/4;
-  //__m256 outputVec;
-    cf_t *xPtr = (cf_t*) x;
-    cf_t *zPtr = (cf_t*) z;
-  
-  __m256 inputVec, outputVec;
-    const __m256 tapsVec  = _mm256_set1_ps(h);
-  for(;i < loops;i++)
-  {
-      inputVec  = _mm256_loadu_ps((float*)xPtr);
-      //__builtin_prefetch(xPtr+4);
-      outputVec = _mm256_mul_ps(inputVec,tapsVec);
-      _mm256_storeu_ps((float*)zPtr,outputVec);
-      xPtr += 4;
-      zPtr += 4;
-  }
-  
-  for(i = loops * 4;i < len;i++)
-  {
-      *zPtr++ = (*xPtr++) * h;
+      srslte_simd_f_storeu((float *) &dst[i], temp);
+    }
   }
 #endif
+
+  for (; i < len; i++) {
+    dst[i] = src[i];
+  }
 }

From c41ad5453c0ecb099829d771e5f549c471f55834 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 25 Sep 2017 17:08:11 +0200
Subject: [PATCH 21/55] Solved bugs and compilation error in simd and
 vector_simd

---
 lib/include/srslte/phy/utils/simd.h        | 20 ++++++++++----------
 lib/include/srslte/phy/utils/vector_simd.h |  2 +-
 lib/src/phy/utils/test/vector_test.c       |  8 ++++----
 lib/src/phy/utils/vector_simd.c            | 10 +++++-----
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 774dd54bd..22d8db79d 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -226,7 +226,7 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
   __m512 r = _mm512_add_ps(a, b);
-  return _mm512_mask_sub_ps(r, 0b1010101010101010, a, b);
+  return _mm512_mask_sub_ps(r, 0b0101010101010101, a, b);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   return _mm256_addsub_ps(a, b);
@@ -642,10 +642,10 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
   return _mm512_load_si512(ptr);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  return _mm256_load_si256(ptr);
+  return _mm256_load_si256((__m256i*) ptr);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  return _mm_load_si128(ptr);
+  return _mm_load_si128((__m128i*) ptr);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -653,13 +653,13 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
 
 static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
 #ifdef LV_HAVE_AVX512
-  return _mm512_load_si512(ptr);
+  return _mm512_loadu_si512(ptr);
 #else /* LV_HAVE_AVX512 */
   #ifdef LV_HAVE_AVX2
-  return _mm256_load_si256(ptr);
+  return _mm256_loadu_si256((__m256i*) ptr);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  return _mm_load_si128(ptr);
+  return _mm_loadu_si128((__m128i*) ptr);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -670,10 +670,10 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) {
   _mm512_store_si512(ptr, simdreg);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  _mm256_store_si256(ptr, simdreg);
+  _mm256_store_si256((__m256i*) ptr, simdreg);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  _mm_store_si128(ptr, simdreg);
+  _mm_store_si128((__m128i*) ptr, simdreg);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -684,10 +684,10 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) {
   _mm512_storeu_si512(ptr, simdreg);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  _mm256_storeu_si256(ptr, simdreg);
+  _mm256_storeu_si256((__m256i*) ptr, simdreg);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  _mm_storeu_si128(ptr, simdreg);
+  _mm_storeu_si128((__m128i*) ptr, simdreg);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 8ea2ce9bc..4ee839fab 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -44,7 +44,7 @@ extern "C" {
 #ifdef LV_HAVE_SSE
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
 #else /* LV_HAVE_SSE */
-#define SRSLTE_IS_ALIGNED(PTR) (true)
+#define SRSLTE_IS_ALIGNED(PTR) (1)
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX */
 #endif /* LV_HAVE_AVX512 */
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
index e781d05b9..05dce1d35 100644
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -45,7 +45,7 @@ bool mmse_solver = false;
 bool verbose = false;
 
 #define MAX_MSE (1e-3)
-#define NOF_REPETITIONS (1024*128)
+#define NOF_REPETITIONS (1024)
 #define MAX_FUNCTIONS (64)
 #define MAX_BLOCKS (16)
 
@@ -70,7 +70,7 @@ bool verbose = false;
     return passed;\
 }
 
-#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size)
+#define MALLOC(TYPE, NAME) TYPE *NAME = malloc(sizeof(TYPE)*block_size)
 
 
 static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
@@ -339,7 +339,7 @@ TEST(srslte_vec_prod_conj_ccc,
 TEST(srslte_vec_sc_prod_ccc,
   MALLOC(cf_t, x);
   MALLOC(cf_t, z);
-  cf_t y = RANDOM_F();
+  cf_t y = RANDOM_CF();
 
   cf_t gold;
   for (int i = 0; i < block_size; i++) {
@@ -469,7 +469,7 @@ int main(int argc, char **argv) {
   uint32_t func_count = 0;
   bool passed = true;
 
-  for (uint32_t block_size = 1; block_size <= 1024*16; block_size *= 2) {
+  for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
     func_count = 0;
 
     passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 21132390f..2eb0428b7 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -77,7 +77,7 @@ int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len) {
 
 void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
   int i = 0;
-#ifdef SRSLTE_SIMD_S_SIZE
+#if SRSLTE_SIMD_S_SIZE
   if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
     for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
       simd_s_t a = srslte_simd_s_load(&x[i]);
@@ -106,7 +106,7 @@ void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
 
 void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
   int i = 0;
-#ifdef SRSLTE_SIMD_S_SIZE
+#if SRSLTE_SIMD_S_SIZE
   if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
     for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
       simd_s_t a = srslte_simd_s_load(&x[i]);
@@ -135,7 +135,7 @@ void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
 
 void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
   int i = 0;
-#ifdef SRSLTE_SIMD_S_SIZE
+#if SRSLTE_SIMD_S_SIZE
   if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
     for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
       simd_s_t a = srslte_simd_s_load(&x[i]);
@@ -721,14 +721,14 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
     }
   } else {
     for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
-      simd_f_t temp = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t temp = srslte_simd_f_loadu((float *) &x[i]);
 
       simd_f_t m1 = srslte_simd_f_mul(hre, temp);
       simd_f_t sw = srslte_simd_f_swap(temp);
       simd_f_t m2 = srslte_simd_f_mul(him, sw);
       simd_f_t r = srslte_simd_f_addsub(m1, m2);
 
-      srslte_simd_f_store((float *) &z[i], r);
+      srslte_simd_f_storeu((float *) &z[i], r);
     }
   }
 #endif

From 9e5f999666a97b82fa0558b28294a7b9f62becbf Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Thu, 28 Sep 2017 11:04:26 +0200
Subject: [PATCH 22/55] Added more functions

---
 lib/include/srslte/phy/utils/simd.h        | 237 ++++++++++
 lib/include/srslte/phy/utils/vector.h      |   5 +-
 lib/include/srslte/phy/utils/vector_simd.h |  63 +--
 lib/src/phy/mimo/precoding.c               |   3 +-
 lib/src/phy/sync/find_sss.c                |   8 +-
 lib/src/phy/utils/test/CMakeLists.txt      |   3 +-
 lib/src/phy/utils/test/vector_test.c       | 287 +++++++++++-
 lib/src/phy/utils/vector.c                 | 134 +-----
 lib/src/phy/utils/vector_simd.c            | 493 ++++++++++++++++-----
 9 files changed, 939 insertions(+), 294 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 22d8db79d..9a5f15dbb 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -88,6 +88,8 @@
 #define SRSLTE_SIMD_F_SIZE    16
 #define SRSLTE_SIMD_CF_SIZE   16
 
+#define SRSLTE_SIMD_I_SIZE    16
+
 #define SRSLTE_SIMD_S_SIZE    32
 #define SRSLTE_SIMD_C16_SIZE  0
 
@@ -97,6 +99,8 @@
 #define SRSLTE_SIMD_F_SIZE    8
 #define SRSLTE_SIMD_CF_SIZE   8
 
+#define SRSLTE_SIMD_I_SIZE    8
+
 #define SRSLTE_SIMD_S_SIZE    16
 #define SRSLTE_SIMD_C16_SIZE  16
 
@@ -106,6 +110,8 @@
 #define SRSLTE_SIMD_F_SIZE    4
 #define SRSLTE_SIMD_CF_SIZE   4
 
+#define SRSLTE_SIMD_I_SIZE    4
+
 #define SRSLTE_SIMD_S_SIZE    8
 #define SRSLTE_SIMD_C16_SIZE  8
 
@@ -114,6 +120,8 @@
 #define SRSLTE_SIMD_F_SIZE    0
 #define SRSLTE_SIMD_CF_SIZE   0
 
+#define SRSLTE_SIMD_I_SIZE    0
+
 #define SRSLTE_SIMD_S_SIZE    0
 #define SRSLTE_SIMD_C16_SIZE  0
 
@@ -223,6 +231,20 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
+static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_rcp_ps(a);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_rcp_ps(a);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  return _mm_rcp_ps(a);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
 static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
   __m512 r = _mm512_add_ps(a, b);
@@ -600,6 +622,61 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
   return ret;
 }
 
+static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) { /* ret.re = a.re * b, ret.im = a.im * b */
+    simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  b = _mm512_permutexvar_ps(_mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15), b); /* index vector is the FIRST argument */
+  ret.re = _mm512_mul_ps(a.re, b);
+  ret.im = _mm512_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  b = _mm256_permutevar8x32_ps(b, _mm256_setr_epi32(0,4,1,5,2,6,3,7)); /* here the index vector is the SECOND argument */
+  ret.re = _mm256_mul_ps(a.re, b);
+  ret.im = _mm256_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re = _mm_mul_ps(a.re, b);
+  ret.im = _mm_mul_ps(a.im, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+    return ret;
+}
+
+static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm512_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
+  ret.re = _mm512_mul_ps(a.re, rcp);
+  ret.im = _mm512_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  simd_f_t a2re = _mm256_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm256_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm256_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm256_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm256_xor_ps(_mm256_set1_ps(-0.0f), a.im);
+  ret.re = _mm256_mul_ps(a.re, rcp);
+  ret.im = _mm256_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  simd_f_t a2re = _mm_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
+  ret.re = _mm_mul_ps(a.re, rcp);
+  ret.im = _mm_mul_ps(neg_a_im, rcp);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
+
 static inline simd_cf_t srslte_simd_cf_zero (void) {
   simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
@@ -621,6 +698,106 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
 
 #endif /* SRSLTE_SIMD_CF_SIZE */
 
+#if SRSLTE_SIMD_I_SIZE
+
+#ifdef LV_HAVE_AVX512
+typedef __m512i simd_i_t;
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+typedef __m256i simd_i_t;
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+typedef __m128i simd_i_t;
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+
+static inline simd_i_t srslte_simd_i_load(int *x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_load_epi32((__m512i*)x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_load_si256((__m256i*)x);
+#else
+  #ifdef LV_HAVE_SSE
+  return _mm_load_si128((__m128i*)x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
+#ifdef LV_HAVE_AVX512
+  _mm512_store_epi32((__m512i*)x, reg);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  _mm256_store_si256((__m256i*)x, reg);
+#else
+#ifdef LV_HAVE_SSE
+  _mm_store_si128((__m128i*)x, reg);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_set1(int x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_set1_epi32(x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_set1_epi32(x);
+#else
+  #ifdef LV_HAVE_SSE
+  return _mm_set1_epi32(x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_add_epi32(a, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_add_epi32(a, b);
+#else
+#ifdef LV_HAVE_SSE
+  return _mm_add_epi32(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) { /* per-lane (a > b) mask, not the maximum value */
+#ifdef LV_HAVE_AVX512
+  return _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS), -1); /* expand k-mask to all-ones lanes */
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
+#else /* LV_HAVE_AVX2 */
+  #ifdef LV_HAVE_SSE
+  return  (simd_i_t) _mm_cmpgt_ps(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) { /* lane = sign bit of selector ? b : a */
+#ifdef LV_HAVE_AVX512
+  return _mm512_mask_blend_epi32(_mm512_cmplt_epi32_mask(selector, _mm512_setzero_si512()), a, b); /* no 512-bit blendv exists */
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b,(__m256) selector);
+#else
+  #ifdef LV_HAVE_SSE
+  return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, (__m128)selector);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+#endif /* SRSLTE_SIMD_I_SIZE*/
+
 
 #if SRSLTE_SIMD_S_SIZE
 
@@ -829,6 +1006,20 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
   return ret;
 }
 
+static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
+  simd_c16_t ret;
+#ifdef LV_HAVE_AVX2
+  ret.re.m256 = _mm256_loadu_si256((__m256i*)(re));
+  ret.im.m256 = _mm256_loadu_si256((__m256i*)(im));
+#else
+#ifdef LV_HAVE_SSE
+  ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
+  ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+  return ret;
+}
+
 static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
@@ -845,6 +1036,22 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
+  __m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
+  _mm256_storeu_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
+  _mm256_storeu_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
+#else
+#ifdef LV_HAVE_SSE
+  __m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
+  __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
+  _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
+  _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   _mm256_store_si256((__m256i *) re, simdreg.re.m256);
@@ -857,6 +1064,18 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  _mm256_storeu_si256((__m256i *) re, simdreg.re.m256);
+  _mm256_storeu_si256((__m256i *) im, simdreg.im.m256);
+#else
+#ifdef LV_HAVE_SSE
+  _mm_storeu_si128((__m128i *) re, simdreg.re.m128);
+  _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -905,6 +1124,24 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
 
 #endif /* SRSLTE_SIMD_C16_SIZE */
 
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
 
+static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX2
+  __m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
+  __m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
+  __m256i ai = _mm256_cvttps_epi32(aa);
+  __m256i bi = _mm256_cvttps_epi32(bb);
+  return _mm256_packs_epi32(ai, bi);
+#else
+#ifdef LV_HAVE_SSE
+  __m128i ai = _mm_cvttps_epi32(a);
+  __m128i bi = _mm_cvttps_epi32(b);
+  return _mm_packs_epi32(ai, bi);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
 
 #endif //SRSLTE_SIMD_H_H
diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h
index 0fadfb334..9b99c6fff 100644
--- a/lib/include/srslte/phy/utils/vector.h
+++ b/lib/include/srslte/phy/utils/vector.h
@@ -123,6 +123,7 @@ SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
@@ -142,8 +143,8 @@ SRSLTE_API float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len);
 SRSLTE_API int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len); 
 
 /* z=x/y vector division (element-wise) */
-SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len);
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len);
+SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len);
 
 /* conjugate */
diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index 4ee839fab..294cff50f 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -36,26 +36,29 @@ extern "C" {
 #include "srslte/config.h"
 
 #ifdef LV_HAVE_AVX512
+#define SRSLTE_SIMD_BIT_ALIGN 512
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0)
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX
+#define SRSLTE_SIMD_BIT_ALIGN 256
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0)
 #else /* LV_HAVE_AVX */
 #ifdef LV_HAVE_SSE
+#define SRSLTE_SIMD_BIT_ALIGN 128
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
 #else /* LV_HAVE_SSE */
+#define SRSLTE_SIMD_BIT_ALIGN 64
 #define SRSLTE_IS_ALIGNED(PTR) (1)
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX */
 #endif /* LV_HAVE_AVX512 */
 
-SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
-
+/* SIMD Basic vector math */
 SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
 SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
-SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API float srslte_vec_acc_ff_simd(float *x, int len);
 
 SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);
 
@@ -63,59 +66,63 @@ SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);
 
 SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Vector Scalar Product */
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+
 SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);
 
 SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);
 
+/* SIMD Vector Product */
+SRSLTE_API void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                             int16_t *r_im, int len);
+
+SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+
+SRSLTE_API void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
+
 SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);
 
 SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 
 SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+/* SIMD Division */
+SRSLTE_API void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
-                                             int16_t *r_im, int len);
+SRSLTE_API void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+SRSLTE_API void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Dot product */
 SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);
 
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);
 
-SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
-
 SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);
 
-SRSLTE_API  void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
+SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
 
+/* SIMD Modulus functions */
 SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);
 
 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
+/* Other Functions */
+SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len); 
-
-SRSLTE_API  void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len); 
-
-SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
-
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
+SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len);
 
 SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
 
+
+/* SIMD Find Max functions */
+SRSLTE_API uint32_t srslte_vec_max_fi_simd(float *x, int len);
+
+SRSLTE_API uint32_t srslte_vec_max_ci_simd(cf_t *x, int len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c
index f1aab3b5d..8f8bd7737 100644
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@@ -36,17 +36,16 @@
 
 #ifdef LV_HAVE_SSE
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
 #endif
 
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 #endif
 
+#include "srslte/phy/utils/mat.h"
 
 static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;
 
diff --git a/lib/src/phy/sync/find_sss.c b/lib/src/phy/sync/find_sss.c
index 2afeced42..082aee52a 100644
--- a/lib/src/phy/sync/find_sss.c
+++ b/lib/src/phy/sync/find_sss.c
@@ -70,14 +70,12 @@ static void corr_all_sz_partial(cf_t z[SRSLTE_SSS_N], float s[SRSLTE_SSS_N][SRSL
 
 static void extract_pair_sss(srslte_sss_synch_t *q, cf_t *input, cf_t *ce, cf_t y[2][SRSLTE_SSS_N]) {
   cf_t input_fft[SRSLTE_SYMBOL_SZ_MAX];
-  float ce_mod[2*SRSLTE_SSS_N], z_real[2*SRSLTE_SSS_N], z_imag[2*SRSLTE_SSS_N];
-  
+
   srslte_dft_run_c(&q->dftp_input, input, input_fft);
   
   if (ce) {
-    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce, ce_mod, 
-                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], z_real, z_imag,
-                       2*SRSLTE_SSS_N);
+    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce,
+                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], 2*SRSLTE_SSS_N);
   }
   
   for (int i = 0; i < SRSLTE_SSS_N; i++) {
diff --git a/lib/src/phy/utils/test/CMakeLists.txt b/lib/src/phy/utils/test/CMakeLists.txt
index 76df7ac59..1f5c66827 100644
--- a/lib/src/phy/utils/test/CMakeLists.txt
+++ b/lib/src/phy/utils/test/CMakeLists.txt
@@ -44,4 +44,5 @@ add_test(algebra_2x2_zf_solver_test algebra_test -z)
 add_test(algebra_2x2_mmse_solver_test algebra_test -m)
 
 add_executable(vector_test vector_test.c)
-target_link_libraries(vector_test srslte_phy)
\ No newline at end of file
+target_link_libraries(vector_test srslte_phy)
+add_test(vector_test vector_test)
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
index 05dce1d35..cf0f38926 100644
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -89,6 +89,26 @@ float squared_error (cf_t a, cf_t b) {
   return diff_re*diff_re + diff_im*diff_im;
 }
 
+TEST(srslte_vec_acc_ff,
+     MALLOC(float, x);
+         float z;
+
+         float gold = 0.0f; /* scalar reference sum: the function under test accumulates real floats */
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+         }
+
+         TEST_CALL(z = srslte_vec_acc_ff(x, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold += x[i];
+         }
+
+         mse += fabsf(gold - z) / fabsf(gold); /* relative error; magnitude of gold keeps it non-negative */
+
+         free(x);
+)
+
 TEST(srslte_vec_dot_prod_sss,
      MALLOC(int16_t, x);
          MALLOC(int16_t, y);
@@ -314,6 +334,37 @@ TEST(srslte_vec_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_prod_ccc_split,
+     MALLOC(float, x_re);
+     MALLOC(float, x_im);
+     MALLOC(float, y_re);
+     MALLOC(float, y_im);
+     MALLOC(float, z_re);
+     MALLOC(float, z_im);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x_re[i] = RANDOM_F();
+           x_im[i] = RANDOM_F();
+           y_re[i] = RANDOM_F();
+           y_im[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_prod_ccc_split(x_re, x_im, y_re, y_im, z_re, z_im, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = (x_re[i] + I * x_im[i]) * (y_re[i] + I * y_im[i]);
+           mse += cabsf(gold - (z_re[i] + I*z_im[i]));
+         }
+
+         free(x_re);
+         free(x_im);
+         free(y_re);
+         free(y_im);
+         free(z_re);
+         free(z_im);
+)
+
 TEST(srslte_vec_prod_conj_ccc,
   MALLOC(cf_t, x);
   MALLOC(cf_t, y);
@@ -357,6 +408,27 @@ TEST(srslte_vec_sc_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_convert_fi,
+  MALLOC(float, x);
+  MALLOC(short, z);
+      float scale = 1000.0f;
+
+  short gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = (float) RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_convert_fi(x, z, scale, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+      gold = (short) ((x[i] * scale));
+      mse += cabsf((float)gold - (float) z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
 TEST(srslte_vec_prod_fff,
   MALLOC(float, x);
   MALLOC(float, y);
@@ -376,6 +448,30 @@ TEST(srslte_vec_prod_fff,
   }
 
   free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_prod_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(y);
   free(z);
 )
 
@@ -461,66 +557,216 @@ TEST(srslte_vec_sc_prod_cfc,
   free(z);
 )
 
+TEST(srslte_vec_div_ccc,
+     MALLOC(cf_t, x);
+         MALLOC(cf_t, y);
+         MALLOC(cf_t, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_CF();
+         }
+
+         TEST_CALL(srslte_vec_div_ccc(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i]);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+
+TEST(srslte_vec_div_cfc,
+     MALLOC(cf_t, x);
+         MALLOC(float, y);
+         MALLOC(cf_t, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+           y[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_div_cfc(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i])/cabsf(gold);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+
+TEST(srslte_vec_div_fff,
+     MALLOC(float, x);
+         MALLOC(float, y);
+         MALLOC(float, z);
+
+         cf_t gold;
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+           y[i] = RANDOM_F();
+         }
+
+         TEST_CALL(srslte_vec_div_fff(x, y, z, block_size))
+
+         for (int i = 0; i < block_size; i++) {
+           gold = x[i] / y[i];
+           mse += cabsf(gold - z[i]);
+         }
+         mse /= block_size;
+
+         free(x);
+         free(y);
+         free(z);
+)
+
+TEST(srslte_vec_max_fi,
+     MALLOC(float, x);
+
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_F();
+         }
+
+         uint32_t max_index = 0;
+         TEST_CALL(max_index = srslte_vec_max_fi(x, block_size);)
+
+         float gold_value = -INFINITY;
+         uint32_t gold_index = 0;
+         for (int i = 0; i < block_size; i++) {
+           if (gold_value < x[i]) {
+             gold_value = x[i];
+             gold_index = i;
+           }
+         }
+         mse = (gold_index != max_index) ? 1:0;
+
+         free(x);
+)
+
+TEST(srslte_vec_max_abs_ci,
+     MALLOC(cf_t, x);
+
+         for (int i = 0; i < block_size; i++) {
+           x[i] = RANDOM_CF();
+         }
+
+         uint32_t max_index = 0;
+         TEST_CALL(max_index = srslte_vec_max_abs_ci(x, block_size);)
+
+         float gold_value = -INFINITY;
+         uint32_t gold_index = 0;
+         for (int i = 0; i < block_size; i++) {
+           cf_t a = x[i];
+           float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+           if (abs2 > gold_value) {
+             gold_value = abs2;
+             gold_index = (uint32_t)i;
+           }
+         }
+         mse = (gold_index != max_index) ? 1:0;
+
+         free(x);
+)
+
 int main(int argc, char **argv) {
   char func_names[MAX_FUNCTIONS][32];
   double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
   uint32_t sizes[32];
   uint32_t size_count = 0;
   uint32_t func_count = 0;
-  bool passed = true;
+  bool passed[MAX_FUNCTIONS][MAX_BLOCKS];
+  bool all_passed = true;
 
   for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
     func_count = 0;
 
-    passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_ff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_convert_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc_split(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
-    passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_div_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+
+    passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
     func_count++;
 
     sizes[size_count] = block_size;
@@ -546,10 +792,11 @@ int main(int argc, char **argv) {
   for (int i = 0; i < func_count; i++) {
     printf("%32s | ", func_names[i]);
     for (int j = 0; j < size_count; j++) {
-      printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      all_passed &= passed[i][j];
     }
     printf(" |\n");
   }
 
-  return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
+  return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
 }
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index cb21f24f1..f85dbca0a 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -48,18 +48,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
 
 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_ACC_FUNCTION
-  float result;
-  volk_32f_accumulator_s32f(&result,x,len);
-  return result;
-#else
-   int i;
-   float z=0;
-   for (i=0;i<len;i++) {
-     z+=x[i];
-   }
-   return z;
-#endif
+  return srslte_vec_acc_ff_simd(x, len);
 }
 
 void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
@@ -190,14 +179,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
 
 // Chest UL 
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-#else
   srslte_vec_sc_prod_ccc_simd(x,h,z,len);
-#endif
 }
 
 // Used in turbo decoder 
@@ -217,14 +199,7 @@ void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
 }
 
 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = (int16_t) (x[i]*scale);
-  }
-#else 
-  srslte_vec_convert_fi_sse(x, z, scale, len);
-#endif
+  srslte_vec_convert_fi_simd(x, z, scale, len);
 }
 
 void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
@@ -234,13 +209,7 @@ void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
 }
 
 void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }
-#else
-  srslte_vec_lut_sss_sse(x, lut, y, len);
-#endif
+  srslte_vec_lut_sss_simd(x, lut, y, len);
 }
 
 void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
@@ -280,7 +249,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
-  if (posix_memalign(&ptr,512,size)) {
+  if (posix_memalign(&ptr, SRSLTE_SIMD_BIT_ALIGN, size)) {
     return NULL;
   } else {
     return ptr;
@@ -292,7 +261,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
   return realloc(ptr, new_size);
 #else
   void *new_ptr;
-  if (posix_memalign(&new_ptr,256,new_size)) {
+  if (posix_memalign(&new_ptr, SRSLTE_SIMD_BIT_ALIGN, new_size)) {
     return NULL;
   } else {
     memcpy(new_ptr, ptr, old_size);
@@ -415,6 +384,7 @@ void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) {
 
 // Used in PSS
 void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
+  /* This function is used in initialisation only, then no optimisation is required */
   int i;
   for (i=0;i<len;i++) {
     y[i] = conjf(x[i]);
@@ -423,10 +393,7 @@ void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
 
 // Used in scrambling complex 
 void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
+  srslte_vec_prod_cfc_simd(x, y, z, len);
 }
 
 // Used in scrambling float
@@ -444,6 +411,10 @@ void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
   srslte_vec_prod_ccc_simd(x,y,z,len);
 }
 
+void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len) {
+  srslte_vec_prod_ccc_split_simd(x_re, x_im, y_re , y_im, z_re,z_im, len);
+}
+
 // PRACH, CHEST UL, etc. 
 void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
   srslte_vec_prod_conj_ccc_simd(x,y,z,len);
@@ -452,40 +423,17 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 //#define DIV_USE_VEC
 
 // Used in SSS 
-/* Complex division is conjugate multiplication + real division */
-void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_prod_conj_ccc(x,y,z,len);
-  srslte_vec_abs_square_cf(y,y_mod,len);
-  srslte_vec_div_cfc(z,y_mod,z,z_real,z_imag,len);  
-#else 
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i]; 
-  }
-#endif
+void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
+  srslte_vec_div_ccc_simd(x, y, z, len);
 }
 
 /* Complex division by float z=x/y */
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_deinterleave_cf(x, z_real, z_imag, len);
-  srslte_vec_div_fff(z_real, y, z_real, len);
-  srslte_vec_div_fff(z_imag, y, z_imag, len);
-  srslte_vec_interleave_cf(z_real, z_imag, z, len);
-#else
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i]; 
-  }
-#endif
+void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
+  srslte_vec_div_cfc_simd(x, y, z, len);
 }
 
 void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i];
-  }
+  srslte_vec_div_fff_simd(x, y, z, len);
 }
 
 // PSS. convolution 
@@ -554,30 +502,7 @@ void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
 }
 
 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
-
-  // This is to solve an issue with incorrect type of 1st parameter in version 1.2 of volk
-#ifdef HAVE_VOLK_MAX_FUNCTION_32
-  uint32_t target=0;
-  volk_32f_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_FUNCTION_16
-  uint32_t target=0;
-  volk_32f_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  for (i=0;i<len;i++) {
-    if (x[i]>m) {
-      m=x[i];
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_fi_simd(x, len);
 }
 
 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
@@ -616,30 +541,7 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
 
 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_32
-  uint32_t target=0;
-  volk_32fc_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_16
-  uint32_t target=0;
-  volk_32fc_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  float tmp;
-  for (i=0;i<len;i++) {
-    tmp = crealf(x[i])*crealf(x[i]) + cimagf(x[i])*cimagf(x[i]);
-    if (tmp>m) {
-      m=tmp;
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_ci_simd(x, len);
 }
 
 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 2eb0428b7..2dd354548 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
 
 
 /* No improvement with AVX */
-void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
-{
-#ifdef DEBUG_MODE
-  for (int i=0;i<len;i++) {
+void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
+  int i = 0;
+#ifdef LV_HAVE_SSE
+#ifndef DEBUG_MODE
+  /* Scatter y[lut[i]] = x[i], 8 elements per step. CMake variables are not C macros, so debug builds are detected via DEBUG_MODE */
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
+      /* The scatter itself is scalar: extract each lane and store it at its looked-up position */
+      for (int k = 0; k < 8; k++) {
+        int16_t v = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) v;
+      }
+    }
+  } else {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
+      /* Same scatter as the aligned branch; only the loads differ */
+      for (int k = 0; k < 8; k++) {
+        int16_t v = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) v;
+      }
+    }
+  }
+#endif
+#endif
+  /* Scalar path: leftover elements, or the whole vector when the SSE section is compiled out */
+  for (; i < len; i++) {
     y[lut[i]] = x[i];
   }
-#else
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* lutPtr = (__m128i*) lut;
-
-  __m128i xVal, lutVal;
-  for(;number < points; number++){
-
-    xVal   = _mm_loadu_si128(xPtr);
-    lutVal = _mm_loadu_si128(lutPtr);
-    
-    for (int i=0;i<8;i++) {
-      int16_t x = (int16_t)   _mm_extract_epi16(xVal, i); 
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
-      y[l] = x;
-    }
-    xPtr ++;
-    lutPtr ++;
-  }
-
-  number = points * 8;
-  for(;number < len; number++){
-    y[lut[number]] = x[number];
-  }
-#endif  
-#endif
 }
 
 /* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
-void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
+void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len) {
+  int i = 0;
 
-  const unsigned int eighthPoints = len / 8;
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
+  simd_f_t s = srslte_simd_f_set1(scale);
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]);
 
-  const float* inputVectorPtr = (const float*)x;
-  int16_t* outputVectorPtr = z;
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);
 
-  __m128 vScalar = _mm_set_ps1(scale);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
 
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+      srslte_simd_s_store(&z[i], i16);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&x[i + SRSLTE_SIMD_F_SIZE]);
 
-    ret1 = _mm_mul_ps(inputVal1, vScalar);
-    ret2 = _mm_mul_ps(inputVal2, vScalar);
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);
 
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
 
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
+      srslte_simd_s_storeu(&z[i], i16);
+    }
   }
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
 
-  number = eighthPoints * 8;
-  for(; number < len; number++){
-    z[number] = (int16_t) (x[number] * scale);
+  for(; i < len; i++){
+    z[i] = (int16_t) (x[i] * scale);
   }
-#endif
 }
 
+float srslte_vec_acc_ff_simd(float *x, int len) {
+  int i = 0;
+  float acc_sum = 0.0f;
 
-// for enb no-volk
-void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 4;
+#if SRSLTE_SIMD_F_SIZE
+  simd_f_t simd_sum = srslte_simd_f_zero();
 
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
 
-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
 
-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
-
-    zVal = _mm_add_ps(xVal, yVal);
-
-    _mm_storeu_ps(zPtr, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
   }
 
-  number = points * 4;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
+  __attribute__((aligned(SRSLTE_SIMD_F_SIZE*4))) float sum[SRSLTE_SIMD_F_SIZE];
+  srslte_simd_f_store(sum, simd_sum);
+  for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
+    acc_sum += sum[k];
   }
 #endif
-}
 
-void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
-
-  __m256 xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_loadu_ps(xPtr);
-    yVal = _mm256_loadu_ps(yPtr);
-
-    zVal = _mm256_add_ps(xVal, yVal);
-
-    _mm256_storeu_ps(zPtr, zVal);
-
-    xPtr += 8;
-    yPtr += 8;
-    zPtr += 8;
+  for (; i<len; i++) {
+    acc_sum += x[i];
   }
 
-  for(number = points * 8;number < len; number++){
-    z[number] = x[number] + y[number];
-  }
-#endif
+  return acc_sum;
 }
 
 cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
@@ -570,6 +540,34 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len)
   return result;
 }
 
+void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
+  int i = 0;
+  /* Element-wise product of a complex vector by a real vector: z[i] = x[i] * y[i] */
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_f_t s = srslte_simd_f_load(&y[i]);
+      /* Load x as separate re/im registers, scale both by s, store back interleaved into z */
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_f_t s = srslte_simd_f_loadu(&y[i]);
+      /* At least one buffer is unaligned here, so every access must use the unaligned variant */
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+  /* Scalar path: leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i<len; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
+
 void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
   int i = 0;
 
@@ -630,17 +628,29 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
   }
 }
 
-void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
+void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
   int i = 0;
 
 #if SRSLTE_SIMD_F_SIZE
-  for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
-    simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
-    simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
+      simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
 
-    simd_cf_t r = srslte_simd_cf_prod(a, b);
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
 
-    srslte_simd_cf_store(&r_re[i], &r_im[i], r);
+      srslte_simd_cf_store(&r_re[i], &r_im[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cf_loadu(&a_re[i], &a_im[i]);
+      simd_cf_t b = srslte_simd_cf_loadu(&b_re[i], &b_im[i]);
+
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+      srslte_simd_cf_storeu(&r_re[i], &r_im[i], r);
+    }
   }
 #endif
 
@@ -655,13 +665,25 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i
   int i = 0;
 
 #if SRSLTE_SIMD_C16_SIZE
-  for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
-    simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
-    simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
+    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+      simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
+      simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
 
-    simd_c16_t r = srslte_simd_c16_prod(a, b);
+      simd_c16_t r = srslte_simd_c16_prod(a, b);
 
-    srslte_simd_c16_store(&r_re[i], &r_im[i], r);
+      srslte_simd_c16_store(&r_re[i], &r_im[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+      simd_c16_t a = srslte_simd_c16_loadu(&a_re[i], &a_im[i]);
+      simd_c16_t b = srslte_simd_c16_loadu(&b_re[i], &b_im[i]);
+
+      simd_c16_t r = srslte_simd_c16_prod(a, b);
+
+      srslte_simd_c16_storeu(&r_re[i], &r_im[i], r);
+    }
   }
 #endif
 
@@ -701,6 +723,103 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
   }
 }
 
+void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
+  int i = 0;
+  /* Element-wise complex division: z[i] = x[i] / y[i] */
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_load(&y[i]);
+      /* The division is carried out as a product: a * srslte_simd_cf_rcp(b) */
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
+      /* Same computation as the aligned branch, using unaligned loads and stores */
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+  /* Scalar path: leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+
+void srslte_vec_div_cfc_simd(cf_t *x,float *y, cf_t *z, int len) {
+  int i = 0;
+  /* Element-wise division of a complex vector by a real vector: z[i] = x[i] / y[i] */
+#if SRSLTE_SIMD_CF_SIZE && SRSLTE_SIMD_CF_SIZE == SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+      /* Divide by multiplying with srslte_simd_f_rcp(b); the rcp intrinsics behind it are approximate, unlike the scalar path */
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+      /* Same computation as the aligned branch, using unaligned loads and stores */
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+  /* Scalar path (exact division): leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
+  /* Element-wise real division: z[i] = x[i] / y[i] */
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+      /* Divide by multiplying with srslte_simd_f_rcp(b); the rcp intrinsics behind it are approximate, unlike the scalar path */
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+      /* Same computation as the aligned branch, using unaligned loads and stores */
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_storeu(&z[i], r);
+    }
+  }
+#endif
+  /* Scalar path (exact division): leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
 void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
   int i = 0;
 
@@ -895,3 +1014,137 @@ void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
     dst[i] = src[i];
   }
 }
+
+uint32_t srslte_vec_max_fi_simd(float *x, int len) {
+  int i = 0;
+  /* Returns the index of the largest element of x; 0 when no element is greater than -INFINITY (e.g. len <= 0) */
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  /* Lane k starts at index k; every SIMD step advances all lane indexes by SRSLTE_SIMD_I_SIZE */
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      /* NOTE(review): res is presumably a per-lane mask of a > simd_max_values that drives both selects -- confirm in simd.h */
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      /* Same update as the aligned branch; only the load differs */
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+  /* Reduce the per-lane maxima, and the indexes recorded with them, to a single result */
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+  /* Scalar path: leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i < len; i++) {
+    if (x[i] > max_value) {
+      max_value = x[i];
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}
+
+uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
+  int i = 0;
+  /* Returns the index of the element of x with the largest squared magnitude (re*re + im*im); 0 when len <= 0 */
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  /* Lane k starts at index k; every SIMD step advances all lane indexes by SRSLTE_SIMD_I_SIZE */
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+      /* x1/x2 hold interleaved re,im floats; NOTE(review): the stride assumes SRSLTE_SIMD_I_SIZE == SRSLTE_SIMD_F_SIZE -- confirm */
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+      /* NOTE(review): hadd presumably adds adjacent re^2, im^2 pairs, giving one |x|^2 per element in index order -- confirm */
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+      /* Same computation as the aligned branch; only the loads differ */
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+  /* Reduce the per-lane maxima, and the indexes recorded with them, to a single result */
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+  /* Scalar path: leftover elements, or the whole vector when the SIMD section is compiled out */
+  for (; i < len; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > max_value) {
+      max_value = abs2;
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}

From 94a06867a3c3efbb8c6eb36e1ac2fa5f1aa2dc07 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Fri, 29 Sep 2017 16:42:46 +0200
Subject: [PATCH 23/55] Optimized SIMD includes and solved AVX512 bugs

---
 CMakeLists.txt                       |  4 +-
 lib/include/srslte/phy/utils/mat.h   |  3 -
 lib/include/srslte/phy/utils/simd.h  | 97 ++++++++++++++++++++--------
 lib/src/phy/mimo/precoding.c         |  5 +-
 lib/src/phy/utils/test/mat_test.c    |  1 -
 lib/src/phy/utils/test/vector_test.c |  2 -
 lib/src/phy/utils/vector_simd.c      | 10 +--
 7 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28afbc0d8..efaa1973a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -283,8 +283,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   endif (HAVE_AVX2)
 
   if (HAVE_AVX512)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -DLV_HAVE_AVX512")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DLV_HAVE_AVX512")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -DLV_HAVE_AVX512")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -DLV_HAVE_AVX512")
   endif(HAVE_AVX512)
 
   if(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/lib/include/srslte/phy/utils/mat.h b/lib/include/srslte/phy/utils/mat.h
index 942559955..339cfea23 100644
--- a/lib/include/srslte/phy/utils/mat.h
+++ b/lib/include/srslte/phy/utils/mat.h
@@ -60,7 +60,6 @@ SRSLTE_API float srslte_mat_2x2_cn(cf_t h00,
 
 
 #ifdef LV_HAVE_SSE
-#include <smmintrin.h>
 
 /* SSE implementation for complex reciprocal */
 SRSLTE_API __m128 srslte_mat_cf_recip_sse(__m128 a);
@@ -84,8 +83,6 @@ SRSLTE_API void srslte_mat_2x2_mmse_sse(__m128 y0, __m128 y1,
 
 #ifdef LV_HAVE_AVX
 
-#include <immintrin.h>
-
 /* AVX implementation for complex reciprocal */
 SRSLTE_API __m256 srslte_mat_cf_recip_avx(__m256 a);
 
diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 9a5f15dbb..2590794f2 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -27,7 +27,12 @@
 #ifndef SRSLTE_SIMD_H_H
 #define SRSLTE_SIMD_H_H
 
+#ifdef LV_HAVE_SSE /* AVX, AVX2, FMA, AVX512  are in this group */
+#ifndef __OPTIMIZE__
+#define __OPTIMIZE__
+#endif
 #include <immintrin.h>
+#endif /* LV_HAVE_SSE */
 
 /*
  * SSE Macros
@@ -233,7 +238,7 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 
 static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
 #ifdef LV_HAVE_AVX512
-  return _mm512_rcp_ps(a);
+  return _mm512_rcp14_ps(a);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   return _mm256_rcp_ps(a);
@@ -372,10 +377,16 @@ typedef struct {
 static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) {
   simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
-  __m512 in1 = _mm512_permute_ps(_mm512_load_ps((float*)(ptr)), 0b11011000);
-  __m512 in2 = _mm512_permute_ps(_mm512_load_ps((float*)(ptr + 8)), 0b11011000);
-  ret.re = _mm512_unpacklo_ps(in1, in2);
-  ret.im = _mm512_unpackhi_ps(in1, in2);
+  __m512 in1 = _mm512_load_ps((float*)(ptr));
+  __m512 in2 = _mm512_load_ps((float*)(ptr + SRSLTE_SIMD_CF_SIZE/2));
+  ret.re = _mm512_permutex2var_ps(in1, _mm512_setr_epi32(0x00, 0x02, 0x04, 0x06,
+                                                         0x08, 0x0A, 0x0C, 0x0E,
+                                                         0x10, 0x12, 0x14, 0x16,
+                                                         0x18, 0x1A, 0x1C, 0x1E), in2);
+  ret.im = _mm512_permutex2var_ps(in1, _mm512_setr_epi32(0x01, 0x03, 0x05, 0x07,
+                                                         0x09, 0x0B, 0x0D, 0x0F,
+                                                         0x11, 0x13, 0x15, 0x17,
+                                                         0x19, 0x1B, 0x1D, 0x1F), in2);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   __m256 in1 = _mm256_permute_ps(_mm256_load_ps((float*)(ptr)), 0b11011000);
@@ -398,10 +409,16 @@ static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) {
 static inline simd_cf_t srslte_simd_cfi_loadu(cf_t *ptr) {
   simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
-  __m512 in1 = _mm512_permute_ps(_mm512_loadu_ps((float*)(ptr)), 0b11011000);
-  __m512 in2 = _mm512_permute_ps(_mm512_loadu_ps((float*)(ptr + 8)), 0b11011000);
-  ret.re = _mm512_unpacklo_ps(in1, in2);
-  ret.im = _mm512_unpackhi_ps(in1, in2);
+  __m512 in1 = _mm512_loadu_ps((float*)(ptr));
+  __m512 in2 = _mm512_loadu_ps((float*)(ptr + SRSLTE_SIMD_CF_SIZE/2));
+  ret.re = _mm512_permutex2var_ps(in1, _mm512_setr_epi32(0x00, 0x02, 0x04, 0x06,
+                                                         0x08, 0x0A, 0x0C, 0x0E,
+                                                         0x10, 0x12, 0x14, 0x16,
+                                                         0x18, 0x1A, 0x1C, 0x1E), in2);
+  ret.im = _mm512_permutex2var_ps(in1, _mm512_setr_epi32(0x01, 0x03, 0x05, 0x07,
+                                                         0x09, 0x0B, 0x0D, 0x0F,
+                                                         0x11, 0x13, 0x15, 0x17,
+                                                         0x19, 0x1B, 0x1D, 0x1F), in2);
 #else /* LV_HAVE_AVX512 */
   #ifdef LV_HAVE_AVX2
   __m256 in1 = _mm256_permute_ps(_mm256_loadu_ps((float*)(ptr)), 0b11011000);
@@ -460,10 +477,16 @@ static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) {
 
 static inline void srslte_simd_cfi_store(cf_t *ptr, simd_cf_t simdreg) {
 #ifdef LV_HAVE_AVX512
-  __m512 out1 = _mm512_permute_ps(simdreg.re, 0b11011000);
-  __m512 out2 = _mm512_permute_ps(simdreg.im, 0b11011000);
-  _mm512_store_ps((float*)(ptr), _mm512_unpacklo_ps(out1, out2));
-  _mm512_store_ps((float*)(ptr + 8), _mm512_unpackhi_ps(out1, out2));
+  __m512 s1 = _mm512_permutex2var_ps(simdreg.re, _mm512_setr_epi32(0x00, 0x10, 0x01, 0x11,
+                                                                   0x02, 0x12, 0x03, 0x13,
+                                                                   0x04, 0x14, 0x05, 0x15,
+                                                                   0x06, 0x16, 0x07, 0x17), simdreg.im);
+  __m512 s2 = _mm512_permutex2var_ps(simdreg.re, _mm512_setr_epi32(0x08, 0x18, 0x09, 0x19,
+                                                                   0x0A, 0x1A, 0x0B, 0x1B,
+                                                                   0x0C, 0x1C, 0x0D, 0x1D,
+                                                                   0x0E, 0x1E, 0x0F, 0x1F), simdreg.im);
+  _mm512_store_ps((float*)(ptr), s1);
+  _mm512_store_ps((float*)(ptr + 8), s2);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   __m256 out1 = _mm256_permute_ps(simdreg.re, 0b11011000);
@@ -481,10 +504,16 @@ static inline void srslte_simd_cfi_store(cf_t *ptr, simd_cf_t simdreg) {
 
 static inline void srslte_simd_cfi_storeu(cf_t *ptr, simd_cf_t simdreg) {
 #ifdef LV_HAVE_AVX512
-  __m512 out1 = _mm512_permute_ps(simdreg.re, 0b11011000);
-  __m512 out2 = _mm512_permute_ps(simdreg.im, 0b11011000);
-  _mm512_storeu_ps((float*)(ptr), _mm512_unpacklo_ps(out1, out2));
-  _mm512_storeu_ps((float*)(ptr + 8), _mm512_unpackhi_ps(out1, out2));
+  __m512 s1 = _mm512_permutex2var_ps(simdreg.re, _mm512_setr_epi32(0x00, 0x10, 0x01, 0x11,
+                                                                   0x02, 0x12, 0x03, 0x13,
+                                                                   0x04, 0x14, 0x05, 0x15,
+                                                                   0x06, 0x16, 0x07, 0x17), simdreg.im);
+  __m512 s2 = _mm512_permutex2var_ps(simdreg.re, _mm512_setr_epi32(0x08, 0x18, 0x09, 0x19,
+                                                                   0x0A, 0x1A, 0x0B, 0x1B,
+                                                                   0x0C, 0x1C, 0x0D, 0x1D,
+                                                                   0x0E, 0x1E, 0x0F, 0x1F), simdreg.im);
+  _mm512_storeu_ps((float*)(ptr), s1);
+  _mm512_storeu_ps((float*)(ptr + 8), s2);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   __m256 out1 = _mm256_permute_ps(simdreg.re, 0b11011000);
@@ -625,7 +654,6 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
 static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
     simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
-  b = _mm512_permutexvar_ps(b, _mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
   ret.re = _mm512_mul_ps(a.re, b);
   ret.im = _mm512_mul_ps(a.im, b);
 #else /* LV_HAVE_AVX512 */
@@ -649,7 +677,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
   simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
   simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
   simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
-  simd_f_t rcp = _mm512_rcp_ps(mod2);
+  simd_f_t rcp = _mm512_rcp14_ps(mod2);
   simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
   ret.re = _mm512_mul_ps(a.re, rcp);
   ret.im = _mm512_mul_ps(neg_a_im, rcp);
@@ -702,12 +730,15 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
 
 #ifdef LV_HAVE_AVX512
 typedef __m512i simd_i_t;
+typedef __mmask16 simd_sel_t;
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
 typedef __m256i simd_i_t;
+typedef __m256 simd_sel_t;
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
 typedef __m128i simd_i_t;
+typedef __m128i simd_sel_t;
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -768,12 +799,12 @@ static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
-static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
+static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
-  return (simd_i_t) _mm512_cmp_ps_mask(a, b, _CMP_GT_OS);
+  return _mm512_cmp_ps_mask(a, b, _CMP_GT_OS);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
+  return _mm256_cmp_ps(a, b, _CMP_GT_OS);
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return  (simd_i_t) _mm_cmpgt_ps(a, b);
@@ -782,15 +813,15 @@ static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
-static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) {
+static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t selector) {
 #ifdef LV_HAVE_AVX512
-  return (__m512i) _mm512_blendv_ps((__m512)a, (__m512) b, (__m512) selector);
+  return (__m512i) _mm512_mask_blend_ps( selector, (__m512)a, (__m512) b);
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b,(__m256) selector);
+  return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b, selector);
 #else
   #ifdef LV_HAVE_SSE
-  return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, (__m128)selector);
+  return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, (__m128) selector); /* simd_sel_t is __m128i on SSE; bit-cast it to the float mask type blendv takes */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -1127,6 +1158,19 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
 #if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
 
 static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX512
+  __m512 aa = _mm512_permutex2var_ps(a, _mm512_setr_epi32(0x00, 0x01, 0x02, 0x03,
+                                                          0x08, 0x09, 0x0A, 0x0B,
+                                                          0x10, 0x11, 0x12, 0x13,
+                                                          0x18, 0x19, 0x1A, 0x1B), b);
+  __m512 bb = _mm512_permutex2var_ps(a, _mm512_setr_epi32(0x04, 0x05, 0x06, 0x07,
+                                                          0x0C, 0x0D, 0x0E, 0x0F,
+                                                          0x14, 0x15, 0x16, 0x17,
+                                                          0x1C, 0x1D, 0x1E, 0x1F), b);
+  __m512i ai = _mm512_cvttps_epi32(aa);
+  __m512i bi = _mm512_cvttps_epi32(bb);
+  return _mm512_packs_epi32(ai, bi);
+#else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
   __m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
   __m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
@@ -1140,6 +1184,7 @@ static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
   return _mm_packs_epi32(ai, bi);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
 }
 
 #endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_C16_SIZE */
diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c
index 8f8bd7737..21350524b 100644
--- a/lib/src/phy/mimo/precoding.c
+++ b/lib/src/phy/mimo/precoding.c
@@ -33,20 +33,17 @@
 #include "srslte/phy/mimo/precoding.h"
 #include "srslte/phy/utils/vector.h"
 #include "srslte/phy/utils/debug.h"
+#include "srslte/phy/utils/mat.h"
 
 #ifdef LV_HAVE_SSE
-#include <immintrin.h>
 int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
 #endif
 
 #ifdef LV_HAVE_AVX
-#include <immintrin.h>
 int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 #endif
 
-#include "srslte/phy/utils/mat.h"
-
 static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;
 
 /************************************************
diff --git a/lib/src/phy/utils/test/mat_test.c b/lib/src/phy/utils/test/mat_test.c
index 46081da98..0bfb482a9 100644
--- a/lib/src/phy/utils/test/mat_test.c
+++ b/lib/src/phy/utils/test/mat_test.c
@@ -29,7 +29,6 @@
 #include <unistd.h>
 #include <complex.h>
 #include <stdbool.h>
-#include <immintrin.h>
 #include <sys/time.h>
 
 #include "srslte/phy/utils/mat.h"
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
index cf0f38926..8d5b9f2d6 100644
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -29,9 +29,7 @@
 #include <unistd.h>
 #include <complex.h>
 #include <stdbool.h>
-#include <immintrin.h>
 #include <sys/time.h>
-#include <srslte/phy/utils/vector_simd.h>
 #include <memory.h>
 #include <math.h>
 
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 2dd354548..109c99717 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -556,7 +556,7 @@ void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
     for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
       simd_f_t s = srslte_simd_f_loadu(&y[i]);
 
-      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
       simd_cf_t r = srslte_simd_cf_mul(a, s);
       srslte_simd_cfi_storeu(&z[i], r);
     }
@@ -1036,7 +1036,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) {
     for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
       simd_f_t a = srslte_simd_f_load(&x[i]);
 
-      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+      simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
 
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
@@ -1046,7 +1046,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) {
     for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
       simd_f_t a = srslte_simd_f_loadu(&x[i]);
 
-      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+      simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
 
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
@@ -1102,7 +1102,7 @@ uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
 
       simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
 
-      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+      simd_sel_t res = srslte_simd_f_max(z1, simd_max_values);
 
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
@@ -1118,7 +1118,7 @@ uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
 
       simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
 
-      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+      simd_sel_t res = srslte_simd_f_max(z1, simd_max_values);
 
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);

From d6bdabfdc09ddd315e5285ae46e335f59557d9fd Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 29 Sep 2017 20:38:12 +0200
Subject: [PATCH 24/55] Changed all harq delays to variables

---
 lib/include/srslte/common/common.h |  6 +++++
 srsenb/hdr/mac/scheduler_ue.h      |  2 +-
 srsenb/hdr/mac/ue.h                |  2 +-
 srsenb/hdr/phy/phch_common.h       |  8 +++----
 srsenb/src/mac/mac.cc              |  6 ++---
 srsenb/src/mac/scheduler.cc        |  8 +++----
 srsenb/src/mac/scheduler_harq.cc   |  2 +-
 srsenb/src/mac/scheduler_ue.cc     |  4 ++--
 srsenb/src/phy/phch_common.cc      |  4 ++--
 srsenb/src/phy/phch_worker.cc      | 35 +++++++++++++++---------------
 srsue/hdr/mac/mac.h                |  2 +-
 srsue/hdr/phy/phch_common.h        |  2 +-
 srsue/src/mac/proc_bsr.cc          |  2 +-
 srsue/src/phy/phch_common.cc       | 14 ++++++------
 srsue/src/phy/phch_recv.cc         |  4 ++--
 srsue/src/phy/phch_worker.cc       | 26 +++++++++++-----------
 16 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/lib/include/srslte/common/common.h b/lib/include/srslte/common/common.h
index 7156fbfc9..ea89cc8b7 100644
--- a/lib/include/srslte/common/common.h
+++ b/lib/include/srslte/common/common.h
@@ -44,6 +44,12 @@
 #define SRSLTE_N_DRB           8
 #define SRSLTE_N_RADIO_BEARERS 11
 
+#define HARQ_SFMOD   10  // modulus and size of the per-subframe arrays (indexed by tti%HARQ_SFMOD)
+#define HARQ_DELAY_MS 4  // delay, in ms, between a received TTI and the matching transmission
+#define MSG3_DELAY_MS 6  // delay, in ms, added to the current TTI when reserving Msg3 UL resources
+#define HARQ_TX(tti) (((tti)+HARQ_DELAY_MS)%10240)      // tti advanced by one HARQ delay, wrapped at 10240
+#define HARQ_RX(tti) (((tti)+(2*HARQ_DELAY_MS))%10240)  // tti advanced by two HARQ delays, wrapped at 10240
+
 // Cat 3 UE - Max number of DL-SCH transport block bits received within a TTI
 // 3GPP 36.306 Table 4.1.1
 #define SRSLTE_MAX_BUFFER_SIZE_BITS  102048
diff --git a/srsenb/hdr/mac/scheduler_ue.h b/srsenb/hdr/mac/scheduler_ue.h
index b59461140..c454f1edf 100644
--- a/srsenb/hdr/mac/scheduler_ue.h
+++ b/srsenb/hdr/mac/scheduler_ue.h
@@ -173,7 +173,7 @@ private:
   // Allowed DCI locations per CFI and per subframe    
   sched_dci_cce_t dci_locations[3][10];   
 
-  const static int SCHED_MAX_HARQ_PROC = 8; 
+  const static int SCHED_MAX_HARQ_PROC = 2*HARQ_DELAY_MS;
   dl_harq_proc dl_harq[SCHED_MAX_HARQ_PROC]; 
   ul_harq_proc ul_harq[SCHED_MAX_HARQ_PROC]; 
   
diff --git a/srsenb/hdr/mac/ue.h b/srsenb/hdr/mac/ue.h
index b879d040b..0f95a1144 100644
--- a/srsenb/hdr/mac/ue.h
+++ b/srsenb/hdr/mac/ue.h
@@ -120,7 +120,7 @@ private:
   
   uint32_t nof_failures; 
   
-  const static int NOF_HARQ_PROCESSES = 8; 
+  const static int NOF_HARQ_PROCESSES = 2*HARQ_DELAY_MS;
   srslte_softbuffer_tx_t softbuffer_tx[NOF_HARQ_PROCESSES];
   srslte_softbuffer_rx_t softbuffer_rx[NOF_HARQ_PROCESSES];
 
diff --git a/srsenb/hdr/phy/phch_common.h b/srsenb/hdr/phy/phch_common.h
index 00a59d969..b6c4ceb08 100644
--- a/srsenb/hdr/phy/phch_common.h
+++ b/srsenb/hdr/phy/phch_common.h
@@ -78,13 +78,13 @@ public:
   mac_interface_phy *mac; 
   
   // Common objects for schedulign grants 
-  mac_interface_phy::ul_sched_t ul_grants[10];
-  mac_interface_phy::dl_sched_t dl_grants[10];
+  mac_interface_phy::ul_sched_t ul_grants[HARQ_SFMOD];
+  mac_interface_phy::dl_sched_t dl_grants[HARQ_SFMOD];
   
   // Map of pending ACKs for each user 
   typedef struct {
-    bool is_pending[10]; 
-    uint16_t n_pdcch[10];
+    bool is_pending[HARQ_SFMOD];
+    uint16_t n_pdcch[HARQ_SFMOD];
   } pending_ack_t;
   std::map<uint16_t,pending_ack_t> pending_ack;
   
diff --git a/srsenb/src/mac/mac.cc b/srsenb/src/mac/mac.cc
index 7cb30cc65..0528852be 100644
--- a/srsenb/src/mac/mac.cc
+++ b/srsenb/src/mac/mac.cc
@@ -406,7 +406,7 @@ int mac::get_dl_sched(uint32_t tti, dl_sched_t *dl_sched_res)
   log_step_dl(tti);
 
   if (!started) {
-    return 0; 
+    return 0;
   }
   
   if (!dl_sched_res) {
@@ -604,7 +604,7 @@ int mac::get_ul_sched(uint32_t tti, ul_sched_t *ul_sched_res)
 
 void mac::log_step_ul(uint32_t tti) 
 {
-  int tti_ul = tti-8;
+  int tti_ul = tti-(2*HARQ_DELAY_MS);
   if (tti_ul < 0) {
     tti_ul += 10240;
   }
@@ -613,7 +613,7 @@ void mac::log_step_ul(uint32_t tti)
 
 void mac::log_step_dl(uint32_t tti) 
 {
-  int tti_dl = tti-4;
+  int tti_dl = tti-HARQ_DELAY_MS;
   if (tti_dl < 0) {
     tti_dl += 10240;
   }
diff --git a/srsenb/src/mac/scheduler.cc b/srsenb/src/mac/scheduler.cc
index 79cf3f476..5de780470 100644
--- a/srsenb/src/mac/scheduler.cc
+++ b/srsenb/src/mac/scheduler.cc
@@ -541,7 +541,7 @@ int sched::dl_sched_rar(dl_sched_rar_t rar[MAX_RAR_LIST])
                 pending_rar[j].rar_tti = 0;            
                 
                 // Save UL resources 
-                uint32_t pending_tti=(current_tti+6)%10;
+                uint32_t pending_tti=(current_tti+MSG3_DELAY_MS)%10;
                 pending_msg3[pending_tti].enabled = true; 
                 pending_msg3[pending_tti].rnti    = pending_rar[j].rnti; 
                 pending_msg3[pending_tti].L       = L_prb; 
@@ -677,7 +677,7 @@ int sched::ul_sched(uint32_t tti, srsenb::sched_interface::ul_sched_res_t* sched
   pthread_mutex_lock(&mutex);
 
   /* If dl_sched() not yet called this tti (this tti is +4ms advanced), reset CCE state */
-  if ((current_tti+4)%10240 != tti) {
+  if (HARQ_TX(current_tti) != tti) {
     bzero(used_cce, MAX_CCE*sizeof(bool));    
   }
   
@@ -685,9 +685,9 @@ int sched::ul_sched(uint32_t tti, srsenb::sched_interface::ul_sched_res_t* sched
   current_tti = tti; 
   sfn = tti/10;
   if (tti > 4) {
-    sf_idx = (tti-4)%10; 
+    sf_idx = (tti-HARQ_DELAY_MS)%10;
   } else {
-    sf_idx = (tti+10240-4)%10;
+    sf_idx = (tti+10240-HARQ_DELAY_MS)%10;
   }
   int nof_dci_elems   = 0; 
   int nof_phich_elems = 0; 
diff --git a/srsenb/src/mac/scheduler_harq.cc b/srsenb/src/mac/scheduler_harq.cc
index a6ae70d19..f5209b374 100644
--- a/srsenb/src/mac/scheduler_harq.cc
+++ b/srsenb/src/mac/scheduler_harq.cc
@@ -177,7 +177,7 @@ void dl_harq_proc::set_rbgmask(uint32_t new_mask)
 
 bool dl_harq_proc::has_pending_retx(uint32_t current_tti)
 {
-  return srslte_tti_interval(current_tti, tti) >= 8 && has_pending_retx_common(); 
+  return srslte_tti_interval(current_tti, tti) >= (2*HARQ_DELAY_MS) && has_pending_retx_common();
 }
 
 int dl_harq_proc::get_tbs()
diff --git a/srsenb/src/mac/scheduler_ue.cc b/srsenb/src/mac/scheduler_ue.cc
index 1534d12ef..567711cbe 100644
--- a/srsenb/src/mac/scheduler_ue.cc
+++ b/srsenb/src/mac/scheduler_ue.cc
@@ -247,7 +247,7 @@ bool sched_ue::get_pucch_sched(uint32_t current_tti, uint32_t prb_idx[2], uint32
   
   // First check if it has pending ACKs 
   for (int i=0;i<SCHED_MAX_HARQ_PROC;i++) {
-    if (((dl_harq[i].get_tti()+4)%10240) == current_tti) {
+    if (HARQ_TX(dl_harq[i].get_tti()) == current_tti) {
       uint32_t n_pucch = srslte_pucch_get_npucch(dl_harq[i].get_n_cce(), SRSLTE_PUCCH_FORMAT_1A, has_sr, &pucch_sched);
       if (prb_idx) {
         for (int i=0;i<2;i++) {
@@ -295,7 +295,7 @@ bool sched_ue::get_pucch_sched(uint32_t current_tti, uint32_t prb_idx[2], uint32
 int sched_ue::set_ack_info(uint32_t tti, bool ack)
 {
   for (int i=0;i<SCHED_MAX_HARQ_PROC;i++) {
-    if (((dl_harq[i].get_tti()+4)%10240) == tti) {
+    if (HARQ_TX(dl_harq[i].get_tti()) == tti) {
       Debug("SCHED: Set ACK=%d for rnti=0x%x, pid=%d, tti=%d\n", ack, rnti, i, tti);
       dl_harq[i].set_ack(ack); 
       return dl_harq[i].get_tbs();
diff --git a/srsenb/src/phy/phch_common.cc b/srsenb/src/phy/phch_common.cc
index e4d91581e..f14d50c5e 100644
--- a/srsenb/src/phy/phch_common.cc
+++ b/srsenb/src/phy/phch_common.cc
@@ -48,8 +48,8 @@ void phch_common::set_nof_mutex(uint32_t nof_mutex_) {
 }
 
 void phch_common::reset() {
-  bzero(ul_grants, sizeof(mac_interface_phy::ul_sched_t)*10);
-  bzero(dl_grants, sizeof(mac_interface_phy::dl_sched_t)*10);
+  bzero(ul_grants, sizeof(mac_interface_phy::ul_sched_t)*HARQ_SFMOD);
+  bzero(dl_grants, sizeof(mac_interface_phy::dl_sched_t)*HARQ_SFMOD);
 }
 
 bool phch_common::init(srslte_cell_t *cell_, srslte::radio* radio_h_, mac_interface_phy *mac_)
diff --git a/srsenb/src/phy/phch_worker.cc b/srsenb/src/phy/phch_worker.cc
index 3a1fb8ca4..f81d5fbd5 100644
--- a/srsenb/src/phy/phch_worker.cc
+++ b/srsenb/src/phy/phch_worker.cc
@@ -176,11 +176,11 @@ cf_t* phch_worker::get_buffer_rx()
 void phch_worker::set_time(uint32_t tti_, uint32_t tx_mutex_cnt_, srslte_timestamp_t tx_time_)
 {
   tti_rx       = tti_; 
-  tti_tx       = (tti_ + 4)%10240; 
-  tti_sched_ul = (tti_ + 8)%10240; 
-  sf_rx        = tti_rx%10;
-  sf_tx        = tti_tx%10;
-  sf_sched_ul  = tti_sched_ul%10;
+  tti_tx       = HARQ_TX(tti_rx);
+  tti_sched_ul = HARQ_RX(tti_rx);
+  sf_rx        = tti_rx%HARQ_SFMOD;
+  sf_tx        = tti_tx%HARQ_SFMOD;
+  sf_sched_ul  = tti_sched_ul%HARQ_SFMOD;
   tx_mutex_cnt = tx_mutex_cnt_;
   memcpy(&tx_time, &tx_time_, sizeof(srslte_timestamp_t));
 }
@@ -245,7 +245,7 @@ void phch_worker::rem_rnti(uint16_t rnti)
     srslte_enb_ul_rem_rnti(&enb_ul, rnti);
     
     // remove any pending grant for each subframe 
-    for (uint32_t i=0;i<10;i++) {
+    for (uint32_t i=0;i<HARQ_SFMOD;i++) {
       for (uint32_t j=0;j<phy->ul_grants[i].nof_grants;j++) {
         if (phy->ul_grants[i].sched_grants[j].rnti == rnti) {
           phy->ul_grants[i].sched_grants[j].rnti = 0; 
@@ -266,7 +266,7 @@ void phch_worker::rem_rnti(uint16_t rnti)
 void phch_worker::work_imp()
 {
   uint32_t sf_ack;
-
+  
   if (!running) {
     return;
   }
@@ -326,12 +326,12 @@ void phch_worker::work_imp()
   encode_phich(ul_grants[sf_sched_ul].phich, ul_grants[sf_sched_ul].nof_phich, sf_tx);
   
   // Prepare for receive ACK for DL grants in sf_tx+4
-  sf_ack = (sf_tx+4)%10; 
+  sf_ack = HARQ_TX(sf_tx)%HARQ_SFMOD; // one HARQ delay after sf_tx (previously sf_tx+4); HARQ_RX would add two
   phy->ack_clear(sf_ack);
   for (uint32_t i=0;i<dl_grants[sf_tx].nof_grants;i++) {
     // SI-RNTI and RAR-RNTI do not have ACK
     if (dl_grants[sf_tx].sched_grants[i].rnti >= SRSLTE_CRNTI_START && dl_grants[sf_tx].sched_grants[i].rnti <= SRSLTE_CRNTI_END) {
-      phy->ack_set_pending(sf_ack, dl_grants[sf_tx].sched_grants[i].rnti, dl_grants[sf_tx].sched_grants[i].location.ncce);      
+      phy->ack_set_pending(sf_ack, dl_grants[sf_tx].sched_grants[i].rnti, dl_grants[sf_tx].sched_grants[i].location.ncce);
     }
   }
   
@@ -504,8 +504,7 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch,
 
 int phch_worker::decode_pucch(uint32_t tti_rx)
 {
-  uint32_t sf_rx = tti_rx%10;
-  srslte_uci_data_t uci_data; 
+  srslte_uci_data_t uci_data;
   
   for(std::map<uint16_t, ue>::iterator iter=ue_db.begin(); iter!=ue_db.end(); ++iter) {
     uint16_t rnti = (uint16_t) iter->first;
@@ -523,7 +522,7 @@ int phch_worker::decode_pucch(uint32_t tti_rx)
           uci_data.scheduling_request = true; 
         }
       }      
-      if (phy->ack_is_pending(sf_rx, rnti, &last_n_pdcch)) {
+      if (phy->ack_is_pending(tti_rx%HARQ_SFMOD, rnti, &last_n_pdcch)) {
         needs_pucch = true; 
         needs_ack = true; 
         uci_data.uci_ack_len = 1; 
@@ -589,7 +588,7 @@ int phch_worker::encode_phich(srslte_enb_dl_phich_t *acks, uint32_t nof_acks, ui
       srslte_enb_dl_put_phich(&enb_dl, acks[i].ack, 
                               ue_db[rnti].phich_info.n_prb_lowest, 
                               ue_db[rnti].phich_info.n_dmrs, 
-                              sf_idx);
+                              sf_idx%10);
       
       Info("PHICH: rnti=0x%x, hi=%d, I_lowest=%d, n_dmrs=%d, tti_tx=%d\n", 
           rnti, acks[i].ack, 
@@ -606,13 +605,13 @@ int phch_worker::encode_pdcch_ul(srslte_enb_ul_pusch_t *grants, uint32_t nof_gra
   for (uint32_t i=0;i<nof_grants;i++) {
     uint16_t rnti = grants[i].rnti;
     if (grants[i].needs_pdcch && rnti) {
-      if (srslte_enb_dl_put_pdcch_ul(&enb_dl, &grants[i].grant, grants[i].location, rnti, sf_idx)) {
+      if (srslte_enb_dl_put_pdcch_ul(&enb_dl, &grants[i].grant, grants[i].location, rnti, sf_idx%10)) {
         fprintf(stderr, "Error putting PUSCH %d\n",i);
         return SRSLTE_ERROR; 
       }
 
-      Info("PDCCH: UL DCI Format0  rnti=0x%x, cce_index=%d, L=%d, tti_tx=%d\n", 
-          rnti, grants[i].location.ncce, (1<<grants[i].location.L), tti_tx);
+      Info("PDCCH: UL DCI Format0  rnti=0x%x, cce_index=%d, L=%d, tpc=%d, tti_tx=%d\n",
+           rnti, grants[i].location.ncce, (1<<grants[i].location.L), grants[i].grant.tpc_pusch, tti_tx);
     }
   }
   return SRSLTE_SUCCESS; 
@@ -633,7 +632,7 @@ int phch_worker::encode_pdcch_dl(srslte_enb_dl_pdsch_t *grants, uint32_t nof_gra
           format = SRSLTE_DCI_FORMAT1A; 
         break;
       }
-      if (srslte_enb_dl_put_pdcch_dl(&enb_dl, &grants[i].grant, format, grants[i].location, rnti, sf_idx)) {
+      if (srslte_enb_dl_put_pdcch_dl(&enb_dl, &grants[i].grant, format, grants[i].location, rnti, sf_idx%10)) {
         fprintf(stderr, "Error putting PDCCH %d\n",i);
         return SRSLTE_ERROR; 
       }      
@@ -683,7 +682,7 @@ int phch_worker::encode_pdsch(srslte_enb_dl_pdsch_t *grants, uint32_t nof_grants
           len = 1; 
         }        
         log_h->info_hex(ptr, len,
-                             "PDSCH: rnti=0x%x, l_crb=%2d, %s, harq=%d, tbs=%d, mcs=%d, rv=%d, tti_tx=%d\n", 
+                             "PDSCH: rnti=0x%x, l_crb=%2d, %s, harq=%d, tbs=%d, mcs=%d, rv=%d, tti_tx=%d\n",
                              rnti, phy_grant.nof_prb, grant_str, grants[i].grant.harq_process, 
                              phy_grant.mcs[0].tbs/8, phy_grant.mcs[0].idx, grants[i].grant.rv_idx, tti_tx);
       }
diff --git a/srsue/hdr/mac/mac.h b/srsue/hdr/mac/mac.h
index a306af187..d19f668bf 100644
--- a/srsue/hdr/mac/mac.h
+++ b/srsue/hdr/mac/mac.h
@@ -109,7 +109,7 @@ private:
   
   static const int MAC_MAIN_THREAD_PRIO = 5; 
   static const int MAC_PDU_THREAD_PRIO  = 6;
-  static const int MAC_NOF_HARQ_PROC    = 8;
+  static const int MAC_NOF_HARQ_PROC    = 2*HARQ_DELAY_MS;
 
   // Interaction with PHY 
   srslte::tti_sync_cv   ttisync; 
diff --git a/srsue/hdr/phy/phch_common.h b/srsue/hdr/phy/phch_common.h
index aa64fe9ea..fb077e733 100644
--- a/srsue/hdr/phy/phch_common.h
+++ b/srsue/hdr/phy/phch_common.h
@@ -138,7 +138,7 @@ namespace srsue {
       uint32_t I_lowest; 
       uint32_t n_dmrs;
     } pending_ack_t;
-    pending_ack_t pending_ack[10];
+    pending_ack_t pending_ack[HARQ_SFMOD];
     
     bool            is_first_tx;
 
diff --git a/srsue/src/mac/proc_bsr.cc b/srsue/src/mac/proc_bsr.cc
index 898943ab9..43694c1bc 100644
--- a/srsue/src/mac/proc_bsr.cc
+++ b/srsue/src/mac/proc_bsr.cc
@@ -368,7 +368,7 @@ bool bsr_proc::need_to_reset_sr() {
 
 bool bsr_proc::need_to_send_sr(uint32_t tti) {
   if (!sr_is_sent && triggered_bsr_type == REGULAR) {
-    if (srslte_tti_interval(tti,next_tx_tti)>0 && srslte_tti_interval(tti,next_tx_tti) < 10240-4) {
+    if (srslte_tti_interval(tti,next_tx_tti)>0 && srslte_tti_interval(tti,next_tx_tti) < 10240-HARQ_DELAY_MS) {
       reset_sr = false; 
       sr_is_sent = true; 
       Debug("BSR:   Need to send sr: sr_is_sent=true, reset_sr=false, tti=%d, next_tx_tti=%d\n", tti, next_tx_tti);
diff --git a/srsue/src/phy/phch_common.cc b/srsue/src/phy/phch_common.cc
index d49b1ced2..cce1857f1 100644
--- a/srsue/src/phy/phch_common.cc
+++ b/srsue/src/phy/phch_common.cc
@@ -195,13 +195,13 @@ void phch_common::set_dl_rnti(srslte_rnti_type_t type, uint16_t rnti_value, int
 }
 
 void phch_common::reset_pending_ack(uint32_t tti) {
-  pending_ack[tti%10].enabled = false; 
+  pending_ack[tti%HARQ_SFMOD].enabled = false;
 }
 
 void phch_common::set_pending_ack(uint32_t tti, uint32_t I_lowest, uint32_t n_dmrs) {
-  pending_ack[tti%10].enabled  = true; 
-  pending_ack[tti%10].I_lowest = I_lowest;       
-  pending_ack[tti%10].n_dmrs = n_dmrs;            
+  pending_ack[tti%HARQ_SFMOD].enabled  = true;
+  pending_ack[tti%HARQ_SFMOD].I_lowest = I_lowest;
+  pending_ack[tti%HARQ_SFMOD].n_dmrs = n_dmrs;
   Debug("Set pending ACK for tti=%d I_lowest=%d, n_dmrs=%d\n", tti, I_lowest, n_dmrs);
 }
 
@@ -211,12 +211,12 @@ bool phch_common::get_pending_ack(uint32_t tti) {
 
 bool phch_common::get_pending_ack(uint32_t tti, uint32_t *I_lowest, uint32_t *n_dmrs) {
   if (I_lowest) {
-    *I_lowest = pending_ack[tti%10].I_lowest;
+    *I_lowest = pending_ack[tti%HARQ_SFMOD].I_lowest;
   }
   if (n_dmrs) {
-    *n_dmrs = pending_ack[tti%10].n_dmrs;
+    *n_dmrs = pending_ack[tti%HARQ_SFMOD].n_dmrs;
   }
-  return pending_ack[tti%10].enabled;
+  return pending_ack[tti%HARQ_SFMOD].enabled;
 }
 
 /* The transmisison of UL subframes must be in sequence. Each worker uses this function to indicate
diff --git a/srsue/src/phy/phch_recv.cc b/srsue/src/phy/phch_recv.cc
index dad2c82b8..647226384 100644
--- a/srsue/src/phy/phch_recv.cc
+++ b/srsue/src/phy/phch_recv.cc
@@ -715,11 +715,11 @@ void phch_recv::run_thread() {
 
               worker->set_sample_offset(srslte_ue_sync_get_sfo(&ue_sync)/1000);
 
-              /* Compute TX time: Any transmission happens in TTI4 thus advance 4 ms the reception time */
+              /* Compute TX time: any transmission happens at TTI+HARQ_DELAY_MS, so advance the reception time by HARQ_DELAY_MS ms */
               srslte_timestamp_t rx_time, tx_time, tx_time_prach;
               srslte_ue_sync_get_last_timestamp(&ue_sync, &rx_time);
               srslte_timestamp_copy(&tx_time, &rx_time);
-              srslte_timestamp_add(&tx_time, 0, 4e-3 - time_adv_sec);
+              srslte_timestamp_add(&tx_time, 0, HARQ_DELAY_MS*1e-3 - time_adv_sec);
               worker->set_tx_time(tx_time, next_offset);
               next_offset = 0;
 
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index 1c9abc1ca..1c52568af 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -335,7 +335,7 @@ void phch_worker::work_imp()
                  &ul_action.softbuffers[0], ul_action.rv[0], ul_action.rnti, ul_mac_grant.is_from_rar);
     signal_ready = true; 
     if (ul_action.expect_ack) {
-      phy->set_pending_ack(tti + 8, ue_ul.pusch_cfg.grant.n_prb_tilde[0], ul_action.phy_grant.ul.ncs_dmrs);
+      phy->set_pending_ack(tti + 2*HARQ_DELAY_MS, ue_ul.pusch_cfg.grant.n_prb_tilde[0], ul_action.phy_grant.ul.ncs_dmrs);
     }
 
   } else if (dl_action.generate_ack || uci_data.scheduling_request || uci_data.uci_cqi_len > 0) {
@@ -663,7 +663,7 @@ bool phch_worker::decode_pdcch_ul(mac_interface_phy::mac_grant_t* grant)
   char timestr[64];
   timestr[0]='\0';
 
-  phy->reset_pending_ack(tti + 8); 
+  phy->reset_pending_ack(tti + 2*HARQ_DELAY_MS);
 
   srslte_dci_msg_t dci_msg; 
   srslte_ra_ul_dci_t dci_unpacked;
@@ -776,7 +776,7 @@ void phch_worker::set_uci_sr()
 {
   uci_data.scheduling_request = false; 
   if (phy->sr_enabled) {
-    uint32_t sr_tx_tti = (tti+4)%10240;
+    uint32_t sr_tx_tti = HARQ_TX(tti);
     // Get I_sr parameter   
     if (srslte_ue_ul_sr_send_tti(I_sr, sr_tx_tti)) {
       Info("PUCCH: SR transmission at TTI=%d, I_sr=%d\n", sr_tx_tti, I_sr);
@@ -793,7 +793,7 @@ void phch_worker::set_uci_periodic_cqi()
   int cqi_max       = phy->args->cqi_max;
   
   if (period_cqi.configured && rnti_is_set) {
-    if (period_cqi.ri_idx_present && srslte_ri_send(period_cqi.pmi_idx, period_cqi.ri_idx, (tti+4)%10240)) {
+    if (period_cqi.ri_idx_present && srslte_ri_send(period_cqi.pmi_idx, period_cqi.ri_idx, HARQ_TX(tti))) {
       if (uci_data.uci_ri_len) {
         uci_data.uci_cqi[0] = uci_data.uci_ri;
         uci_data.uci_cqi_len = uci_data.uci_ri_len;
@@ -802,7 +802,7 @@ void phch_worker::set_uci_periodic_cqi()
         uci_data.uci_pmi_len = 0;
         Info("PUCCH: Periodic RI=%d\n", uci_data.uci_cqi[0]);
       }
-    } else if (srslte_cqi_send(period_cqi.pmi_idx, (tti+4)%10240)) {
+    } else if (srslte_cqi_send(period_cqi.pmi_idx, HARQ_TX(tti))) {
       srslte_cqi_value_t cqi_report;
       if (period_cqi.format_is_subband) {
         // TODO: Implement subband periodic reports
@@ -868,8 +868,8 @@ void phch_worker::set_uci_aperiodic_cqi()
 
 bool phch_worker::srs_is_ready_to_send() {
   if (srs_cfg.configured) {
-    if (srslte_refsignal_srs_send_cs(srs_cfg.subframe_config, (tti+4)%10) == 1 && 
-        srslte_refsignal_srs_send_ue(srs_cfg.I_srs, (tti+4)%10240)        == 1)
+    if (srslte_refsignal_srs_send_cs(srs_cfg.subframe_config, HARQ_TX(tti)%10) == 1 && // subframe of the TX tti (previously (tti+4)%10)
+        srslte_refsignal_srs_send_ue(srs_cfg.I_srs, HARQ_TX(tti))        == 1)
     {
       return true; 
     }
@@ -889,7 +889,7 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
   char timestr[64];
   timestr[0]='\0';
   
-  if (srslte_ue_ul_cfg_grant(&ue_ul, grant, (tti+4)%10240, rv, current_tx_nb)) {
+  if (srslte_ue_ul_cfg_grant(&ue_ul, grant, HARQ_TX(tti), rv, current_tx_nb)) {
     Error("Configuring UL grant\n");
   }
   
@@ -919,7 +919,7 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
 #endif
 
   Info("PUSCH: tti_tx=%d, n_prb=%d, rb_start=%d, tbs=%d, mod=%d, mcs=%d, rv_idx=%d, ack=%s, ri=%s, cfo=%.1f Hz%s\n",
-         (tti+4)%10240,
+         HARQ_TX(tti),
          grant->L_prb, grant->n_prb[0], 
          grant->mcs.tbs/8, grant->mcs.mod, grant->mcs.idx, rv,
          uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
@@ -950,7 +950,7 @@ void phch_worker::encode_pucch()
     gettimeofday(&t[1], NULL);
 #endif
 
-    if (srslte_ue_ul_pucch_encode(&ue_ul, uci_data, last_dl_pdcch_ncce, (tti+4)%10240, signal_buffer[0])) {
+    if (srslte_ue_ul_pucch_encode(&ue_ul, uci_data, last_dl_pdcch_ncce, HARQ_TX(tti), signal_buffer[0])) {
       Error("Encoding PUCCH\n");
     }
 
@@ -966,7 +966,7 @@ void phch_worker::encode_pucch()
   float gain = set_power(tx_power);  
   
   Info("PUCCH: tti_tx=%d, n_cce=%3d, n_pucch=%d, n_prb=%d, ack=%s%s, ri=%s, pmi=%s%s, sr=%s, cfo=%.1f Hz%s\n",
-         (tti+4)%10240,
+         HARQ_TX(tti),
          last_dl_pdcch_ncce, ue_ul.pucch.last_n_pucch, ue_ul.pucch.last_n_prb, 
        uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
        uci_data.uci_ack_len>1?(uci_data.uci_ack_2?"1":"0"):"",
@@ -987,7 +987,7 @@ void phch_worker::encode_srs()
   char timestr[64];
   timestr[0]='\0';
   
-  if (srslte_ue_ul_srs_encode(&ue_ul, (tti+4)%10240, signal_buffer[0])) 
+  if (srslte_ue_ul_srs_encode(&ue_ul, HARQ_TX(tti), signal_buffer[0]))
   {
     Error("Encoding SRS\n");
   }
@@ -1002,7 +1002,7 @@ void phch_worker::encode_srs()
   float gain = set_power(tx_power);
   uint32_t fi = srslte_vec_max_fi((float*) signal_buffer, SRSLTE_SF_LEN_PRB(cell.nof_prb));
   float *f = (float*) signal_buffer;
-  Info("SRS:   power=%.2f dBm, tti_tx=%d%s\n", tx_power, (tti+4)%10240, timestr);
+  Info("SRS:   power=%.2f dBm, tti_tx=%d%s\n", tx_power, HARQ_TX(tti), timestr);
   
 }
 

From dbae016b003c69f651e2c0fe885281ca61b9cb7c Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Mon, 2 Oct 2017 18:16:03 +0100
Subject: [PATCH 25/55] Removed unused vector functions

---
 lib/include/srslte/phy/utils/vector.h |  35 +----
 lib/src/phy/utils/vector.c            | 182 +-------------------------
 lib/src/phy/utils/vector_simd.c       |  69 ----------
 3 files changed, 2 insertions(+), 284 deletions(-)

diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h
index 9b99c6fff..4a5daefb3 100644
--- a/lib/include/srslte/phy/utils/vector.h
+++ b/lib/include/srslte/phy/utils/vector.h
@@ -54,7 +54,6 @@ extern "C" {
 #define SRSLTE_VEC_EMA(data, average, alpha) ((alpha)*(data)+(1-alpha)*(average))
 
 /** Return the sum of all the elements */
-SRSLTE_API int srslte_vec_acc_ii(int *x, uint32_t len);
 SRSLTE_API float srslte_vec_acc_ff(float *x, uint32_t len);
 SRSLTE_API cf_t srslte_vec_acc_cc(cf_t *x, uint32_t len);
 
@@ -77,7 +76,6 @@ SRSLTE_API void srslte_vec_save_file(char *filename, void *buffer, uint32_t len)
 SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, uint32_t len);
 
 /* sum two vectors */
-SRSLTE_API void srslte_vec_sum_ch(uint8_t *x, uint8_t *y, char *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sub_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len);
@@ -87,39 +85,16 @@ SRSLTE_API void srslte_vec_sum_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t
 SRSLTE_API void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len); 
 SRSLTE_API void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
 
-/* EMA filter: output=coeff*new_data + (1-coeff)*average */
-SRSLTE_API void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len); 
-
-/* Square distance */
-//SRSLTE_API void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints);
-
-/* scalar addition */
-SRSLTE_API void srslte_vec_sc_add_fff(float *x, float h, float *z, uint32_t len); 
-SRSLTE_API void srslte_vec_sc_add_cfc(cf_t *x, float h, cf_t *z, uint32_t len); 
-SRSLTE_API void srslte_vec_sc_add_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len); 
-SRSLTE_API void srslte_vec_sc_add_sss(int16_t *x, int16_t h, int16_t *z, uint32_t len); 
-
 /* scalar product */
 SRSLTE_API void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len); 
-SRSLTE_API void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len); 
-SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_t len); 
 
-/* Normalization */
-SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len);
 
 SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len);
 SRSLTE_API void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len);
-SRSLTE_API void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len); 
 
-SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len);
-SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len); 
-
-SRSLTE_API void srslte_vec_deinterleave_cf(cf_t *x, float *real, float *imag, uint32_t len); 
-SRSLTE_API void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len);
-
-SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len);
+SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
@@ -159,11 +134,6 @@ SRSLTE_API float srslte_vec_corr_ccc(cf_t *x, cf_t *y, uint32_t len);
 /* return the index of the maximum value in the vector */
 SRSLTE_API uint32_t srslte_vec_max_fi(float *x, uint32_t len);
 SRSLTE_API uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len);
-SRSLTE_API int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len);
-SRSLTE_API int16_t srslte_vec_max_abs_star_si(int16_t *x, uint32_t len);
-
-/* maximum between two vectors */
-SRSLTE_API void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len);
 
 /* quantify vector of floats or int16 and convert to uint8_t */
 SRSLTE_API void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len);
@@ -173,9 +143,6 @@ SRSLTE_API void srslte_vec_quant_suc(int16_t *in, uint8_t *out, float gain, int1
 SRSLTE_API void srslte_vec_abs_cf(cf_t *x, float *abs, uint32_t len);
 SRSLTE_API void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t len);
 
-/* argument of each vector element */
-SRSLTE_API void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len);
-
 /* Copy 256 bit aligned vector */
 SRSLTE_API void srs_vec_cf_cpy(cf_t *src, cf_t *dst, int len);
 
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index f85dbca0a..3bb7fb08f 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -37,15 +37,6 @@
 
 
 
-int srslte_vec_acc_ii(int *x, uint32_t len) {
-  int i;
-  int z=0;
-  for (i=0;i<len;i++) {
-    z+=x[i];
-  }
-  return z;
-}
-
 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
   return srslte_vec_acc_ff_simd(x, len);
@@ -61,16 +52,6 @@ cf_t srslte_vec_acc_cc(cf_t *x, uint32_t len) {
   return srslte_vec_acc_cc_simd(x, len);
 }
 
-#warning Remove function if not used!
-/*void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints) {
-  uint32_t i;
-  cf_t diff; 
-  for (i=0;i<npoints;i++) {
-    diff = symbol - points[i];
-    distance[i] = crealf(diff) * crealf(diff) + cimagf(diff) * cimagf(diff);
-  }
-}*/
-
 void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
   srslte_vec_sub_fff_simd(x, y, z, len);
 }
@@ -97,86 +78,16 @@ void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
   srslte_vec_sum_fff((float*) x,(float*) y,(float*) z,2*len);
 }
 
-void srslte_vec_sum_bbb(uint8_t *x, uint8_t *y, uint8_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+y[i];
-  }
-}
-
-void srslte_vec_sc_add_fff(float *x, float h, float *z, uint32_t len) {
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+h;
-  }
-}
-
-void srslte_vec_sc_add_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+ h;
-  }
-}
-
-void srslte_vec_sc_add_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+ h;
-  }
-}
-
-void srslte_vec_sc_add_sss(int16_t *x, int16_t h, int16_t *z, uint32_t len) {
-  int i; 
-  for (i=0;i<len;i++) {
-    z[i] = x[i]+ h;
-  }
-}
-// PSS, PBCH, DEMOD, FFTW, etc. 
+// PSS, PBCH, DEMOD, FFTW, etc.
 void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
   srslte_vec_sc_prod_fff_simd(x, h, z, len);
 }
 
-void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-}
-
-#warning remove function if it is not used
-/*void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
-#ifdef LV_HAVE_AVX2
-  srslte_vec_sc_div2_sss_avx2(x, n_rightshift, z, len);
-#else
-#ifdef LV_HAVE_SSE
-  srslte_vec_sc_div2_sss_sse(x, n_rightshift, z, len);
-#else
-  int i;
-  int pow2_div = 1<<n_rightshift;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]/pow2_div;
-  }
-#endif
-#endif
-}*/
-
-// TODO: Improve this implementation
-void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
-  // We should use fabs() here but is statistically should be similar
-  float *xp = (float*) x; 
-  uint32_t idx = srslte_vec_max_fi(xp, 2*len);
-  float max = xp[idx]; 
-
-  // Normalize before TX 
-  srslte_vec_sc_prod_cfc(x, amplitude/max, y, len);
-}
-
 // Used throughout 
 void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) { 
   srslte_vec_sc_prod_cfc_simd(x,h,z,len);
 }
 
-
 // Chest UL 
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
   srslte_vec_sc_prod_ccc_simd(x,h,z,len);
@@ -190,63 +101,14 @@ void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len) {
   }
 }
 
-
-void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = ((int16_t) x[i]);
-  }
-}
-
 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
   srslte_vec_convert_fi_simd(x, z, scale, len);
 }
 
-void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }
-}
-
 void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
   srslte_vec_lut_sss_simd(x, lut, y, len);
 }
 
-void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_INTERLEAVE_FUNCTION
-  volk_32f_x2_interleave_32fc(x, real, imag, len);
-#else 
-  int i;
-  for (i=0;i<len;i++) {
-    x[i] = real[i] + _Complex_I*imag[i];
-  }
-#endif
-}
-
-void srslte_vec_deinterleave_cf(cf_t *x, float *real, float *imag, uint32_t len) {
-#ifdef HAVE_VOLK_DEINTERLEAVE_FUNCTION
-  volk_32fc_deinterleave_32f_x2(real, imag, x, len);
-#else 
-  int i;
-  for (i=0;i<len;i++) {
-    real[i] = __real__ x[i];
-    imag[i] = __imag__ x[i];
-  }
-#endif
-}
-
-void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    real[i] = __real__ x[i];
-  }
-}
-
-/* Note: We align memory to 32 bytes (for AVX2 compatibility) 
- * because in some cases volk can incorrectly detect the architecture. 
- * This could be inefficient for SSE or non-SIMD platforms but shouldn't 
- * be a huge problem. 
- */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
   if (posix_memalign(&ptr, SRSLTE_SIMD_BIT_ALIGN, size)) {
@@ -493,52 +355,10 @@ void srslte_vec_abs_square_cf(cf_t *x, float *abs_square, uint32_t len) {
   srslte_vec_abs_square_cf_simd(x,abs_square,len);
 }
 
-
-void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    arg[i] = cargf(x[i]);
-  }
-}
-
 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
   return srslte_vec_max_fi_simd(x, len);
 }
 
-int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
-  uint32_t i;
-  int16_t m=-INT16_MIN;
-  for (i=0;i<len;i++) {
-    if (x[i]>m) {
-      m=x[i];
-    }
-  }
-  return m;
-}
-
-int16_t srslte_vec_max_abs_star_si(int16_t *x, uint32_t len) {
-  uint32_t i;
-  int16_t m=-INT16_MIN;
-  for (i=0;i<len;i++) {
-    if (abs(x[i])>m) {
-      m=abs(x[i]);
-    }
-  }
-  return m;
-}
-
-void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
-  uint32_t i; 
-  for (i=0;i<len;i++) {
-    if (x[i] > y[i]) {
-      z[i] = x[i]; 
-    } else {
-      z[i] = y[i]; 
-    }
-  }
-}
-
-
 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
   return srslte_vec_max_ci_simd(x, len);
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 109c99717..0294bd1af 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -162,75 +162,6 @@ void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
   }
 }
 
-
-
-
-#warning remove function if it is not used
-/*
-void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  __m128i* zPtr = (__m128i*) z;
-
-  __m128i xVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm_load_si128(xPtr);
-    
-    zVal = _mm_srai_epi16(xVal, k);                 
-      
-    _mm_store_si128(zPtr, zVal);
-
-    xPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 8;
-  short divn = (1<<k);
-  for(;number < len; number++){
-    z[number] = x[number] / divn;
-  }
-#endif
-}*/
-
-#warning remove function if it is not used
-/*
-void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
-{
-#ifdef LV_HAVE_AVX2
-  unsigned int number = 0;
-  const unsigned int points = len / 16;
-
-  const __m256i* xPtr = (const __m256i*) x;
-  __m256i* zPtr = (__m256i*) z;
-
-  __m256i xVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_load_si256(xPtr);
-    
-    zVal = _mm256_srai_epi16(xVal, k);                 
-      
-    _mm256_store_si256(zPtr, zVal); 
-
-    xPtr ++;
-    zPtr ++;
-  }
-
-  number = points * 16;
-  short divn = (1<<k);
-  for(;number < len; number++){
-    z[number] = x[number] / divn;
-  }
-#endif
-}*/
-
-
-
 /* No improvement with AVX */
 void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
   int i = 0;

From ca0cf017d6a71775c9a842eefef410229a494202 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 5 Oct 2017 16:52:02 +0200
Subject: [PATCH 26/55] Now working with variable HARQ scheduling

---
 lib/include/srslte/common/common.h |  14 +-
 srsenb/hdr/phy/phch_worker.h       |   4 +-
 srsenb/src/mac/scheduler.cc        |   4 +-
 srsenb/src/mac/scheduler_metric.cc |  18 +-
 srsenb/src/mac/scheduler_ue.cc     |  12 +-
 srsenb/src/phy/phch_common.cc      |  10 +-
 srsenb/src/phy/phch_worker.cc      | 367 +++++++++++++++--------------
 srsenb/src/phy/txrx.cc             |   2 +-
 srsue/hdr/mac/mux.h                |   3 +-
 srsue/src/phy/phch_common.cc       |  12 +-
 srsue/src/phy/phch_worker.cc       |  41 ++--
 11 files changed, 260 insertions(+), 227 deletions(-)

diff --git a/lib/include/srslte/common/common.h b/lib/include/srslte/common/common.h
index c6ad06402..9d3d56569 100644
--- a/lib/include/srslte/common/common.h
+++ b/lib/include/srslte/common/common.h
@@ -44,13 +44,15 @@
 #define SRSLTE_N_DRB           8
 #define SRSLTE_N_RADIO_BEARERS 11
 
-#define HARQ_DELAY_MS 4
-#define MSG3_DELAY_MS 6
-#define HARQ_TX(tti) ((tti+HARQ_DELAY_MS)%10240)
-#define HARQ_RX(tti) ((tti+(2*HARQ_DELAY_MS))%10240)
+#define HARQ_DELAY_MS   6
+#define MSG3_DELAY_MS   6
+#define TTI_TX(tti)     (((tti)+HARQ_DELAY_MS)%10240)      // tti advanced by HARQ_DELAY_MS, wrapped at 10240
+#define TTI_RX_ACK(tti) (((tti)+(2*HARQ_DELAY_MS))%10240)  // tti advanced by 2*HARQ_DELAY_MS, wrapped at 10240
 
-#define TTIMOD_SZ   10
-#define TTIMOD(tti)  (tti%TTIMOD_SZ)
+#define TTIMOD_SZ       20
+#define TTIMOD(tti)     ((tti)%TTIMOD_SZ)                   // parameter parenthesized so expression arguments reduce as a whole
+
+#define ASYNC_DL_SCHED  (HARQ_DELAY_MS <= 4)
 
 // Cat 3 UE - Max number of DL-SCH transport block bits received within a TTI
 // 3GPP 36.306 Table 4.1.1
diff --git a/srsenb/hdr/phy/phch_worker.h b/srsenb/hdr/phy/phch_worker.h
index a84045920..3194751e8 100644
--- a/srsenb/hdr/phy/phch_worker.h
+++ b/srsenb/hdr/phy/phch_worker.h
@@ -89,9 +89,9 @@ private:
 
   cf_t          *signal_buffer_rx; 
   cf_t          *signal_buffer_tx; 
-  uint32_t       tti_rx, tti_tx, tti_sched_ul;
+  uint32_t       tti_rx, tti_tx_dl, tti_tx_ul;
   uint32_t       sf_rx, sf_tx, tx_mutex_cnt;
-  uint32_t       t_rx, t_tx, t_sched_ul;
+  uint32_t       t_rx, t_tx_dl, t_tx_ul;
   srslte_enb_dl_t enb_dl;
   srslte_enb_ul_t enb_ul;
   
diff --git a/srsenb/src/mac/scheduler.cc b/srsenb/src/mac/scheduler.cc
index 73f121424..ab499efb0 100644
--- a/srsenb/src/mac/scheduler.cc
+++ b/srsenb/src/mac/scheduler.cc
@@ -677,14 +677,14 @@ int sched::ul_sched(uint32_t tti, srsenb::sched_interface::ul_sched_res_t* sched
   pthread_mutex_lock(&mutex);
 
   /* If dl_sched() not yet called this tti (this tti is +4ms advanced), reset CCE state */
-  if (HARQ_TX(current_tti) != tti) {
+  if (TTI_TX(current_tti) != tti) {
     bzero(used_cce, MAX_CCE*sizeof(bool));    
   }
   
   /* Initialize variables */
   current_tti = tti; 
   sfn = tti/10;
-  if (tti > 4) {
+  if (tti > HARQ_DELAY_MS) {
     sf_idx = (tti-HARQ_DELAY_MS)%10;
   } else {
     sf_idx = (tti+10240-HARQ_DELAY_MS)%10;
diff --git a/srsenb/src/mac/scheduler_metric.cc b/srsenb/src/mac/scheduler_metric.cc
index 708ab2dd8..6c50009f7 100644
--- a/srsenb/src/mac/scheduler_metric.cc
+++ b/srsenb/src/mac/scheduler_metric.cc
@@ -142,8 +142,12 @@ dl_harq_proc* dl_metric_rr::get_user_allocation(sched_ue *user)
   dl_harq_proc *h = user->get_pending_dl_harq(current_tti);
 
   // Time-domain RR scheduling
+#if ASYNC_DL_SCHED
   if (pending_data || h) {
-    if (nof_users_with_data) {    
+#else
+  if (pending_data || (h && !h->is_empty())) {
+#endif
+    if (nof_users_with_data) {
       if (nof_users_with_data == 2) {
       }
       if ((current_tti%nof_users_with_data) != user->ue_idx) {      
@@ -153,7 +157,11 @@ dl_harq_proc* dl_metric_rr::get_user_allocation(sched_ue *user)
   }
   
   // Schedule retx if we have space 
+#if ASYNC_DL_SCHED
   if (h) {
+#else
+  if (h && !h->is_empty()) {
+#endif
     uint32_t retx_mask = h->get_rbgmask();
     // If can schedule the same mask, do it
     if (!allocation_is_valid(retx_mask)) {
@@ -170,10 +178,14 @@ dl_harq_proc* dl_metric_rr::get_user_allocation(sched_ue *user)
       }
     }
   } 
-  // If could not schedule the reTx, or there wasn't any pending retx, find an empty PID 
+  // If could not schedule the reTx, or there wasn't any pending retx, find an empty PID
+#if ASYNC_DL_SCHED
   h = user->get_empty_dl_harq(); 
   if (h) {
-    // Allocate resources based on pending data 
+#else
+  if (h && h->is_empty()) {
+#endif
+    // Allocate resources based on pending data
     if (pending_data) {
       uint32_t pending_rb = user->get_required_prb_dl(pending_data, nof_ctrl_symbols);
       uint32_t newtx_mask = 0; 
diff --git a/srsenb/src/mac/scheduler_ue.cc b/srsenb/src/mac/scheduler_ue.cc
index 525494ad2..7f4042849 100644
--- a/srsenb/src/mac/scheduler_ue.cc
+++ b/srsenb/src/mac/scheduler_ue.cc
@@ -248,7 +248,7 @@ bool sched_ue::get_pucch_sched(uint32_t current_tti, uint32_t prb_idx[2])
   
   // First check if it has pending ACKs 
   for (int i=0;i<SCHED_MAX_HARQ_PROC;i++) {
-    if (HARQ_TX(dl_harq[i].get_tti()) == current_tti) {
+    if (TTI_TX(dl_harq[i].get_tti()) == current_tti) {
       uint32_t n_pucch = srslte_pucch_get_npucch(dl_harq[i].get_n_cce(), SRSLTE_PUCCH_FORMAT_1A, has_sr, &pucch_sched);
       if (prb_idx) {
         for (int i=0;i<2;i++) {
@@ -288,7 +288,7 @@ bool sched_ue::get_pucch_sched(uint32_t current_tti, uint32_t prb_idx[2])
 int sched_ue::set_ack_info(uint32_t tti, bool ack)
 {
   for (int i=0;i<SCHED_MAX_HARQ_PROC;i++) {
-    if (HARQ_TX(dl_harq[i].get_tti()) == tti) {
+    if (TTI_TX(dl_harq[i].get_tti()) == tti) {
       Debug("SCHED: Set ACK=%d for rnti=0x%x, pid=%d, tti=%d\n", ack, rnti, i, tti);
       dl_harq[i].set_ack(ack); 
       return dl_harq[i].get_tbs();
@@ -656,6 +656,7 @@ bool sched_ue::is_sr_triggered()
 /* Gets HARQ process with oldest pending retx */
 dl_harq_proc* sched_ue::get_pending_dl_harq(uint32_t tti)
 {
+#if ASYNC_DL_SCHED
   int oldest_idx=-1; 
   uint32_t oldest_tti = 0; 
   for (int i=0;i<SCHED_MAX_HARQ_PROC;i++) {
@@ -671,7 +672,10 @@ dl_harq_proc* sched_ue::get_pending_dl_harq(uint32_t tti)
     return &dl_harq[oldest_idx]; 
   } else {
     return NULL; 
-  }  
+  }
+#else
+  return &dl_harq[tti%SCHED_MAX_HARQ_PROC];
+#endif
 }
 
 dl_harq_proc* sched_ue::get_empty_dl_harq()
@@ -681,7 +685,7 @@ dl_harq_proc* sched_ue::get_empty_dl_harq()
       return &dl_harq[i]; 
     }
   }
-  return NULL; 
+  return NULL;
 }
 
 ul_harq_proc* sched_ue::get_ul_harq(uint32_t tti)
diff --git a/srsenb/src/phy/phch_common.cc b/srsenb/src/phy/phch_common.cc
index 719064324..c4b94153c 100644
--- a/srsenb/src/phy/phch_common.cc
+++ b/srsenb/src/phy/phch_common.cc
@@ -104,7 +104,7 @@ void phch_common::ack_clear(uint32_t sf_idx)
 
 void phch_common::ack_add_rnti(uint16_t rnti)
 {
-  for (int sf_idx=0;sf_idx<10;sf_idx++) {
+  for (int sf_idx=0;sf_idx<TTIMOD_SZ;sf_idx++) {
     pending_ack[rnti].is_pending[sf_idx] = false; 
   }
 }
@@ -126,14 +126,18 @@ bool phch_common::ack_is_pending(uint32_t sf_idx, uint16_t rnti, uint32_t *last_
 {
   if (pending_ack.count(rnti)) {
     bool ret = pending_ack[rnti].is_pending[sf_idx];  
-    pending_ack[rnti].is_pending[sf_idx] = false; 
+    pending_ack[rnti].is_pending[sf_idx] = false;
+
+    if (ret) {
+
+    }
     
     if (ret && last_n_pdcch) {
       *last_n_pdcch = pending_ack[rnti].n_pdcch[sf_idx];
     }
     return ret; 
   } else {
-    return false; 
+    return false;
   }
 }
 
diff --git a/srsenb/src/phy/phch_worker.cc b/srsenb/src/phy/phch_worker.cc
index 9cc87ddd9..52b617fd5 100644
--- a/srsenb/src/phy/phch_worker.cc
+++ b/srsenb/src/phy/phch_worker.cc
@@ -176,16 +176,16 @@ cf_t* phch_worker::get_buffer_rx()
 void phch_worker::set_time(uint32_t tti_, uint32_t tx_mutex_cnt_, srslte_timestamp_t tx_time_)
 {
   tti_rx       = tti_; 
-  tti_tx       = HARQ_TX(tti_rx);
-  tti_sched_ul = HARQ_RX(tti_rx);
-  
+  tti_tx_dl    = TTI_TX(tti_rx);
+  tti_tx_ul    = TTI_RX_ACK(tti_rx);
+
   sf_rx        = tti_rx%10;
-  sf_tx        = tti_tx%10;
-  
-  t_tx         = TTIMOD(tti_tx);
+  sf_tx        = tti_tx_dl%10;
+
+  t_tx_dl      = TTIMOD(tti_tx_dl);
   t_rx         = TTIMOD(tti_rx);
-  t_sched_ul   = TTIMOD(tti_sched_ul);
-  
+  t_tx_ul      = TTIMOD(tti_tx_ul);
+
   tx_mutex_cnt = tx_mutex_cnt_;
   memcpy(&tx_time, &tx_time_, sizeof(srslte_timestamp_t));
 }
@@ -194,16 +194,16 @@ int phch_worker::add_rnti(uint16_t rnti)
 {
 
   if (srslte_enb_dl_add_rnti(&enb_dl, rnti)) {
-    return -1; 
+    return -1;
   }
   if (srslte_enb_ul_add_rnti(&enb_ul, rnti)) {
-    return -1; 
+    return -1;
   }
 
-  // Create user 
-  ue_db[rnti].rnti = rnti; 
-    
-  return SRSLTE_SUCCESS; 
+  // Create user
+  ue_db[rnti].rnti = rnti;
+
+  return SRSLTE_SUCCESS;
 
 }
 
@@ -211,61 +211,61 @@ uint32_t phch_worker::get_nof_rnti() {
   return ue_db.size();
 }
 
-void phch_worker::set_config_dedicated(uint16_t rnti, 
-                                       srslte_uci_cfg_t *uci_cfg, 
+void phch_worker::set_config_dedicated(uint16_t rnti,
+                                       srslte_uci_cfg_t *uci_cfg,
                                        srslte_pucch_sched_t *pucch_sched,
-                                       srslte_refsignal_srs_cfg_t *srs_cfg, 
+                                       srslte_refsignal_srs_cfg_t *srs_cfg,
                                        uint32_t I_sr, bool pucch_cqi, uint32_t pmi_idx, bool pucch_cqi_ack)
 {
-  pthread_mutex_lock(&mutex); 
+  pthread_mutex_lock(&mutex);
   if (ue_db.count(rnti)) {
     pucch_sched->N_pucch_1 = phy->pucch_cfg.n1_pucch_an;
     srslte_enb_ul_cfg_ue(&enb_ul, rnti, uci_cfg, pucch_sched, srs_cfg);
-        
-    ue_db[rnti].I_sr    = I_sr; 
+
+    ue_db[rnti].I_sr    = I_sr;
     ue_db[rnti].I_sr_en = true;
 
     if (pucch_cqi) {
-      ue_db[rnti].pmi_idx = pmi_idx; 
-      ue_db[rnti].cqi_en  = true;       
-      ue_db[rnti].pucch_cqi_ack = pucch_cqi_ack; 
+      ue_db[rnti].pmi_idx = pmi_idx;
+      ue_db[rnti].cqi_en  = true;
+      ue_db[rnti].pucch_cqi_ack = pucch_cqi_ack;
     } else {
-      ue_db[rnti].pmi_idx = 0; 
-      ue_db[rnti].cqi_en  = false;             
+      ue_db[rnti].pmi_idx = 0;
+      ue_db[rnti].cqi_en  = false;
     }
-    
+
   } else {
     Error("Setting config dedicated: rnti=0x%x does not exist\n");
   }
-  pthread_mutex_unlock(&mutex); 
+  pthread_mutex_unlock(&mutex);
 }
 
 void phch_worker::rem_rnti(uint16_t rnti)
 {
-  pthread_mutex_lock(&mutex); 
+  pthread_mutex_lock(&mutex);
   if (ue_db.count(rnti)) {
     ue_db.erase(rnti);
-    
-    srslte_enb_dl_rem_rnti(&enb_dl, rnti); 
+
+    srslte_enb_dl_rem_rnti(&enb_dl, rnti);
     srslte_enb_ul_rem_rnti(&enb_ul, rnti);
-    
-    // remove any pending grant for each subframe 
+
+    // remove any pending grant for each subframe
     for (uint32_t i=0;i<TTIMOD_SZ;i++) {
       for (uint32_t j=0;j<phy->ul_grants[i].nof_grants;j++) {
         if (phy->ul_grants[i].sched_grants[j].rnti == rnti) {
-          phy->ul_grants[i].sched_grants[j].rnti = 0; 
+          phy->ul_grants[i].sched_grants[j].rnti = 0;
         }
       }
       for (uint32_t j=0;j<phy->dl_grants[i].nof_grants;j++) {
         if (phy->dl_grants[i].sched_grants[j].rnti == rnti) {
-          phy->dl_grants[i].sched_grants[j].rnti = 0; 
+          phy->dl_grants[i].sched_grants[j].rnti = 0;
         }
       }
     }
   } else {
     Error("Removing user: rnti=0x%x does not exist\n", rnti);
   }
-  pthread_mutex_unlock(&mutex); 
+  pthread_mutex_unlock(&mutex);
 }
 
 void phch_worker::work_imp()
@@ -275,18 +275,18 @@ void phch_worker::work_imp()
   }
 
   pthread_mutex_lock(&mutex);
-  
+
   mac_interface_phy::ul_sched_t *ul_grants = phy->ul_grants;
-  mac_interface_phy::dl_sched_t *dl_grants = phy->dl_grants; 
-  mac_interface_phy *mac = phy->mac; 
-  
+  mac_interface_phy::dl_sched_t *dl_grants = phy->dl_grants;
+  mac_interface_phy *mac = phy->mac;
+
   log_h->step(tti_rx);
-  
+
   Debug("Worker %d running\n", get_id());
-  
+
   for(std::map<uint16_t, ue>::iterator iter=ue_db.begin(); iter!=ue_db.end(); ++iter) {
     uint16_t rnti = (uint16_t) iter->first;
-    ue_db[rnti].has_grant_tti = -1; 
+    ue_db[rnti].has_grant_tti = -1;
   }
 
   // Process UL signal
@@ -294,51 +294,51 @@ void phch_worker::work_imp()
 
   // Decode pending UL grants for the tti they were scheduled
   decode_pusch(ul_grants[t_rx].sched_grants, ul_grants[t_rx].nof_grants);
-  
+
   // Decode remaining PUCCH ACKs not associated with PUSCH transmission and SR signals
   decode_pucch();
-      
+
   // Get DL scheduling for the TX TTI from MAC
-  if (mac->get_dl_sched(tti_tx, &dl_grants[t_tx]) < 0) {
+  if (mac->get_dl_sched(tti_tx_dl, &dl_grants[t_tx_dl]) < 0) {
     Error("Getting DL scheduling from MAC\n");
     goto unlock;
-  } 
-  
-  if (dl_grants[t_tx].cfi < 1 || dl_grants[t_tx].cfi > 3) {
-    Error("Invalid CFI=%d\n", dl_grants[t_tx].cfi);
+  }
+
+  if (dl_grants[t_tx_dl].cfi < 1 || dl_grants[t_tx_dl].cfi > 3) {
+    Error("Invalid CFI=%d\n", dl_grants[t_tx_dl].cfi);
     goto unlock;
   }
-  
+
   // Get UL scheduling for the TX TTI from MAC
-  if (mac->get_ul_sched(tti_sched_ul, &ul_grants[t_sched_ul]) < 0) {
+  if (mac->get_ul_sched(tti_tx_ul, &ul_grants[t_tx_ul]) < 0) {
     Error("Getting UL scheduling from MAC\n");
     goto unlock;
-  } 
-  
+  }
+
   // Put base signals (references, PBCH, PCFICH and PSS/SSS) into the resource grid
   srslte_enb_dl_clear_sf(&enb_dl);
-  srslte_enb_dl_set_cfi(&enb_dl, dl_grants[t_tx].cfi);
-  srslte_enb_dl_put_base(&enb_dl, tti_tx);
+  srslte_enb_dl_set_cfi(&enb_dl, dl_grants[t_tx_dl].cfi);
+  srslte_enb_dl_put_base(&enb_dl, tti_tx_dl);
+
+  // Put UL/DL grants to resource grid. PDSCH data will be encoded as well.
+  encode_pdcch_dl(dl_grants[t_tx_dl].sched_grants, dl_grants[t_tx_dl].nof_grants);
+  encode_pdcch_ul(ul_grants[t_tx_ul].sched_grants, ul_grants[t_tx_ul].nof_grants);
+  encode_pdsch(dl_grants[t_tx_dl].sched_grants, dl_grants[t_tx_dl].nof_grants);
 
-  // Put UL/DL grants to resource grid. PDSCH data will be encoded as well. 
-  encode_pdcch_dl(dl_grants[t_tx].sched_grants, dl_grants[t_tx].nof_grants);
-  encode_pdcch_ul(ul_grants[t_sched_ul].sched_grants, ul_grants[t_sched_ul].nof_grants);
-  encode_pdsch(dl_grants[t_tx].sched_grants, dl_grants[t_tx].nof_grants);
-  
   // Put pending PHICH HARQ ACK/NACK indications into subframe
-  encode_phich(ul_grants[t_sched_ul].phich, ul_grants[t_sched_ul].nof_phich);
-  
-  // Prepare for receive ACK for DL grants in t_tx+4
-  phy->ack_clear(TTIMOD(HARQ_TX(sf_tx)));
-  for (uint32_t i=0;i<dl_grants[t_tx].nof_grants;i++) {
+  encode_phich(ul_grants[t_tx_ul].phich, ul_grants[t_tx_ul].nof_phich);
+
+  // Prepare to receive ACKs for these DL grants at TTIMOD(TTI_TX(t_tx_dl)), i.e. HARQ_DELAY_MS subframes after t_tx_dl
+  phy->ack_clear(TTIMOD(TTI_TX(t_tx_dl)));
+  for (uint32_t i=0;i<dl_grants[t_tx_dl].nof_grants;i++) {
     // SI-RNTI and RAR-RNTI do not have ACK
-    if (dl_grants[t_tx].sched_grants[i].rnti >= SRSLTE_CRNTI_START && dl_grants[t_tx].sched_grants[i].rnti <= SRSLTE_CRNTI_END) {
-      phy->ack_set_pending(TTIMOD(HARQ_TX(sf_tx)), dl_grants[t_tx].sched_grants[i].rnti, dl_grants[t_tx].sched_grants[i].location.ncce);
+    if (dl_grants[t_tx_dl].sched_grants[i].rnti >= SRSLTE_CRNTI_START && dl_grants[t_tx_dl].sched_grants[i].rnti <= SRSLTE_CRNTI_END) {
+      phy->ack_set_pending(TTIMOD(TTI_TX(t_tx_dl)), dl_grants[t_tx_dl].sched_grants[i].rnti, dl_grants[t_tx_dl].sched_grants[i].location.ncce);
     }
   }
-  
+
   // Generate signal and transmit
-  srslte_enb_dl_gen_signal(&enb_dl, signal_buffer_tx);  
+  srslte_enb_dl_gen_signal(&enb_dl, signal_buffer_tx);
   Debug("Sending to radio\n");
   phy->worker_end(tx_mutex_cnt, signal_buffer_tx, SRSLTE_SF_LEN_PRB(phy->cell.nof_prb), tx_time);
 
@@ -347,35 +347,35 @@ void phch_worker::work_imp()
 #endif
 
 #ifdef DEBUG_WRITE_FILE
-  if (tti_tx == 10) {
+  if (tti_tx_dl == 10) {
     fclose(f);
     exit(-1);
   }
-#endif    
-  
+#endif
+
   /* Tell the plotting thread to draw the plots */
 #ifdef ENABLE_GUI
   if ((int) get_id() == plot_worker_id) {
-    sem_post(&plot_sem);    
+    sem_post(&plot_sem);
   }
 #endif
 
 unlock:
-  pthread_mutex_unlock(&mutex); 
+  pthread_mutex_unlock(&mutex);
 
 }
 
 
 int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
 {
-  srslte_uci_data_t uci_data; 
+  srslte_uci_data_t uci_data;
   bzero(&uci_data, sizeof(srslte_uci_data_t));
-  
-  uint32_t wideband_cqi_value = 0; 
-  
-  uint32_t n_rb_ho = 0; 
+
+  uint32_t wideband_cqi_value = 0;
+
+  uint32_t n_rb_ho = 0;
   for (uint32_t i=0;i<nof_pusch;i++) {
-    uint16_t rnti = grants[i].rnti; 
+    uint16_t rnti = grants[i].rnti;
     if (rnti) {
 
     #ifdef LOG_EXECTIME
@@ -386,27 +386,27 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
 
       // Get pending ACKs with an associated PUSCH transmission
       if (phy->ack_is_pending(t_rx, rnti)) {
-        uci_data.uci_ack_len = 1; 
+        uci_data.uci_ack_len = 1;
       }
-      // Configure PUSCH CQI channel 
+      // Configure PUSCH CQI channel
       srslte_cqi_value_t cqi_value;
-      bool cqi_enabled = false; 
+      bool cqi_enabled = false;
       if (ue_db[rnti].cqi_en && srslte_cqi_send(ue_db[rnti].pmi_idx, tti_rx)) {
         cqi_value.type = SRSLTE_CQI_TYPE_WIDEBAND;
-        cqi_enabled = true; 
+        cqi_enabled = true;
       } else if (grants[i].grant.cqi_request) {
         cqi_value.type = SRSLTE_CQI_TYPE_SUBBAND_HL;
         cqi_value.subband_hl.N = (phy->cell.nof_prb > 7) ? srslte_cqi_hl_get_no_subbands(phy->cell.nof_prb) : 0;
-        cqi_enabled = true; 
+        cqi_enabled = true;
       }
       if (cqi_enabled) {
         uci_data.uci_cqi_len = srslte_cqi_size(&cqi_value);
       }
-      
-      // mark this tti as having an ul grant to avoid pucch 
-      ue_db[rnti].has_grant_tti = tti_rx; 
-      
-      srslte_ra_ul_grant_t phy_grant; 
+
+      // mark this tti as having an ul grant to avoid pucch
+      ue_db[rnti].has_grant_tti = tti_rx;
+
+      srslte_ra_ul_grant_t phy_grant;
       int res = -1;
       if (!srslte_ra_ul_dci_to_grant(&grants[i].grant, enb_ul.cell.nof_prb, n_rb_ho, &phy_grant, tti_rx%8)) {
         if (phy_grant.mcs.mod == SRSLTE_MOD_64QAM) {
@@ -414,27 +414,27 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
         }
         phy_grant.Qm = SRSLTE_MIN(phy_grant.Qm, 4);
         res = srslte_enb_ul_get_pusch(&enb_ul, &phy_grant, grants[i].softbuffer,
-                                                rnti, grants[i].rv_idx, 
-                                                grants[i].current_tx_nb, 
-                                                grants[i].data, 
-                                                &uci_data, 
+                                                rnti, grants[i].rv_idx,
+                                                grants[i].current_tx_nb,
+                                                grants[i].data,
+                                                &uci_data,
                                                 sf_rx);
       } else {
         Error("Computing PUSCH grant\n");
-        return SRSLTE_ERROR; 
+        return SRSLTE_ERROR;
       }
-      
+
     #ifdef LOG_EXECTIME
       gettimeofday(&t[2], NULL);
       get_time_interval(t);
       snprintf(timestr, 64, ", dec_time=%4d us", (int) t[0].tv_usec);
     #endif
-      
-      bool crc_res = (res == 0); 
-                   
+
+      bool crc_res = (res == 0);
+
       // Save PHICH scheduling for this user. Each user can have just 1 PUSCH grant per TTI
-      ue_db[rnti].phich_info.n_prb_lowest = enb_ul.pusch_cfg.grant.n_prb_tilde[0];                                           
-      ue_db[rnti].phich_info.n_dmrs       = phy_grant.ncs_dmrs;                                           
+      ue_db[rnti].phich_info.n_prb_lowest = enb_ul.pusch_cfg.grant.n_prb_tilde[0];
+      ue_db[rnti].phich_info.n_dmrs       = phy_grant.ncs_dmrs;
 
       char cqi_str[64];
       if (cqi_enabled) {
@@ -446,8 +446,8 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
         }
         snprintf(cqi_str, 64, ", cqi=%d", wideband_cqi_value);
       }
-      
-      float snr_db  = 10*log10(srslte_chest_ul_get_snr(&enb_ul.chest)); 
+
+      float snr_db  = 10*log10(srslte_chest_ul_get_snr(&enb_ul.chest));
 
       /*
       if (!crc_res && enb_ul.pusch_cfg.grant.L_prb == 1 && enb_ul.pusch_cfg.grant.n_prb[0] == 0 && snr_db > 5) {
@@ -456,8 +456,8 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
         srslte_vec_save_file("d", enb_ul.pusch.d, sizeof(cf_t)*enb_ul.pusch_cfg.nbits.nof_re);
         srslte_vec_save_file("ce2", enb_ul.pusch.ce, sizeof(cf_t)*enb_ul.pusch_cfg.nbits.nof_re);
         srslte_vec_save_file("z", enb_ul.pusch.z, sizeof(cf_t)*enb_ul.pusch_cfg.nbits.nof_re);
-        printf("saved sf_idx=%d, mcs=%d, tbs=%d, rnti=%d, rv=%d, snr=%.1f\n", tti%10, 
-               grants[i].grant.mcs_idx, enb_ul.pusch_cfg.cb_segm.tbs, rnti, grants[i].rv_idx, snr_db); 
+        printf("saved sf_idx=%d, mcs=%d, tbs=%d, rnti=%d, rv=%d, snr=%.1f\n", tti%10,
+               grants[i].grant.mcs_idx, enb_ul.pusch_cfg.cb_segm.tbs, rnti, grants[i].rv_idx, snr_db);
         exit(-1);
       }
       */
@@ -465,120 +465,121 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
           "PUSCH: rnti=0x%x, prb=(%d,%d), tbs=%d, mcs=%d, rv=%d, snr=%.1f dB, n_iter=%d, crc=%s%s%s%s\n",
           rnti, phy_grant.n_prb[0], phy_grant.n_prb[0]+phy_grant.L_prb,
           phy_grant.mcs.tbs/8, phy_grant.mcs.idx, grants[i].grant.rv_idx,
-          snr_db, 
+          snr_db,
           srslte_pusch_last_noi(&enb_ul.pusch),
           crc_res?"OK":"KO",
           uci_data.uci_ack_len>0?(uci_data.uci_ack?", ack=1":", ack=0"):"",
-          uci_data.uci_cqi_len>0?cqi_str:"",         
-          timestr);    
-      
-      // Notify MAC of RL status 
+          uci_data.uci_cqi_len>0?cqi_str:"",
+          timestr);
+
+      // Notify MAC of RL status
       if (grants[i].grant.rv_idx == 0) {
         if (res && snr_db < PUSCH_RL_SNR_DB_TH) {
           Debug("PUSCH: Radio-Link failure snr=%.1f dB\n", snr_db);
           phy->mac->rl_failure(rnti);
         } else {
           phy->mac->rl_ok(rnti);
-        }        
+        }
       }
-      
+
       // Notify MAC new received data and HARQ Indication value
-      phy->mac->crc_info(tti_rx, rnti, phy_grant.mcs.tbs/8, crc_res);    
+      phy->mac->crc_info(tti_rx, rnti, phy_grant.mcs.tbs/8, crc_res);
       if (uci_data.uci_ack_len) {
         phy->mac->ack_info(tti_rx, rnti, uci_data.uci_ack && (crc_res || snr_db > PUSCH_RL_SNR_DB_TH));
       }
-      
-      // Notify MAC of UL SNR and DL CQI 
+
+      // Notify MAC of UL SNR and DL CQI
       if (snr_db >= PUSCH_RL_SNR_DB_TH) {
         phy->mac->snr_info(tti_rx, rnti, snr_db);
       }
       if (uci_data.uci_cqi_len>0 && crc_res) {
         phy->mac->cqi_info(tti_rx, rnti, wideband_cqi_value);
       }
-      
-      // Save metrics stats 
+
+      // Save metrics stats
       ue_db[rnti].metrics_ul(phy_grant.mcs.idx, 0, snr_db, srslte_pusch_last_noi(&enb_ul.pusch));
-    }    
+    }
   }
-  return SRSLTE_SUCCESS; 
+  return SRSLTE_SUCCESS;
 }
 
 
 int phch_worker::decode_pucch()
 {
   srslte_uci_data_t uci_data;
-  
+
   for(std::map<uint16_t, ue>::iterator iter=ue_db.begin(); iter!=ue_db.end(); ++iter) {
     uint16_t rnti = (uint16_t) iter->first;
 
     if (rnti >= SRSLTE_CRNTI_START && rnti <= SRSLTE_CRNTI_END && ue_db[rnti].has_grant_tti != (int) tti_rx) {
-      // Check if user needs to receive PUCCH 
-      bool needs_pucch = false, needs_ack=false, needs_sr=false, needs_cqi=false; 
+      // Check if user needs to receive PUCCH
+      bool needs_pucch = false, needs_ack=false, needs_sr=false, needs_cqi=false;
       uint32_t last_n_pdcch = 0;
       bzero(&uci_data, sizeof(srslte_uci_data_t));
-      
+
       if (ue_db[rnti].I_sr_en) {
         if (srslte_ue_ul_sr_send_tti(ue_db[rnti].I_sr, tti_rx)) {
-          needs_pucch = true; 
-          needs_sr = true; 
-          uci_data.scheduling_request = true; 
+          needs_pucch = true;
+          needs_sr = true;
+          uci_data.scheduling_request = true;
         }
-      }      
+      }
+
       if (phy->ack_is_pending(t_rx, rnti, &last_n_pdcch)) {
-        needs_pucch = true; 
-        needs_ack = true; 
-        uci_data.uci_ack_len = 1; 
+        needs_pucch = true;
+        needs_ack = true;
+        uci_data.uci_ack_len = 1;
       }
       srslte_cqi_value_t cqi_value;
       if (ue_db[rnti].cqi_en && (ue_db[rnti].pucch_cqi_ack || !needs_ack)) {
         if (srslte_cqi_send(ue_db[rnti].pmi_idx, tti_rx)) {
-          needs_pucch = true; 
-          needs_cqi = true; 
-          cqi_value.type = SRSLTE_CQI_TYPE_WIDEBAND; 
+          needs_pucch = true;
+          needs_cqi = true;
+          cqi_value.type = SRSLTE_CQI_TYPE_WIDEBAND;
           uci_data.uci_cqi_len = srslte_cqi_size(&cqi_value);
         }
       }
-      
+
       if (needs_pucch) {
-        if (srslte_enb_ul_get_pucch(&enb_ul, rnti, last_n_pdcch, t_rx, &uci_data)) {
+        if (srslte_enb_ul_get_pucch(&enb_ul, rnti, last_n_pdcch, sf_rx, &uci_data)) {
           fprintf(stderr, "Error getting PUCCH\n");
-          return SRSLTE_ERROR; 
+          return SRSLTE_ERROR;
         }
         if (uci_data.uci_ack_len > 0) {
-          phy->mac->ack_info(tti_rx, rnti, uci_data.uci_ack && (srslte_pucch_get_last_corr(&enb_ul.pucch) >= PUCCH_RL_CORR_TH));      
+          phy->mac->ack_info(tti_rx, rnti, uci_data.uci_ack && (srslte_pucch_get_last_corr(&enb_ul.pucch) >= PUCCH_RL_CORR_TH));
         }
         if (uci_data.scheduling_request) {
-          phy->mac->sr_detected(tti_rx, rnti);                
+          phy->mac->sr_detected(tti_rx, rnti);
         }
-        
+
         char cqi_str[64];
         if (uci_data.uci_cqi_len) {
           srslte_cqi_value_unpack(uci_data.uci_cqi, &cqi_value);
           phy->mac->cqi_info(tti_rx, rnti, cqi_value.wideband.wideband_cqi);
           sprintf(cqi_str, ", cqi=%d", cqi_value.wideband.wideband_cqi);
         }
-        log_h->info("PUCCH: rnti=0x%x, corr=%.2f, n_pucch=%d, n_prb=%d%s%s%s\n", 
-                    rnti, 
+        log_h->info("PUCCH: rnti=0x%x, corr=%.2f, n_pucch=%d, n_prb=%d%s%s%s\n",
+                    rnti,
                     srslte_pucch_get_last_corr(&enb_ul.pucch),
                     enb_ul.pucch.last_n_pucch, enb_ul.pucch.last_n_prb,
-                    needs_ack?(uci_data.uci_ack?", ack=1":", ack=0"):"", 
-                    needs_sr?(uci_data.scheduling_request?", sr=yes":", sr=no"):"", 
-                    needs_cqi?cqi_str:"");                
+                    needs_ack?(uci_data.uci_ack?", ack=1":", ack=0"):"",
+                    needs_sr?(uci_data.scheduling_request?", sr=yes":", sr=no"):"",
+                    needs_cqi?cqi_str:"");
 
 
-        // Notify MAC of RL status 
+        // Notify MAC of RL status
         if (!needs_sr) {
           if (srslte_pucch_get_last_corr(&enb_ul.pucch) < PUCCH_RL_CORR_TH) {
             Debug("PUCCH: Radio-Link failure corr=%.1f\n", srslte_pucch_get_last_corr(&enb_ul.pucch));
             phy->mac->rl_failure(rnti);
           } else {
             phy->mac->rl_ok(rnti);
-          }          
-        }                
+          }
+        }
       }
     }
-  }    
-  return 0; 
+  }
+  return 0;
 }
 
 
@@ -587,15 +588,15 @@ int phch_worker::encode_phich(srslte_enb_dl_phich_t *acks, uint32_t nof_acks)
   for (uint32_t i=0;i<nof_acks;i++) {
     uint16_t rnti = acks[i].rnti;
     if (rnti) {
-      srslte_enb_dl_put_phich(&enb_dl, acks[i].ack, 
-                              ue_db[rnti].phich_info.n_prb_lowest, 
-                              ue_db[rnti].phich_info.n_dmrs, 
+      srslte_enb_dl_put_phich(&enb_dl, acks[i].ack,
+                              ue_db[rnti].phich_info.n_prb_lowest,
+                              ue_db[rnti].phich_info.n_dmrs,
                               sf_tx);
-      
-      Info("PHICH: rnti=0x%x, hi=%d, I_lowest=%d, n_dmrs=%d, tti_tx=%d\n", 
-          rnti, acks[i].ack, 
-          ue_db[rnti].phich_info.n_prb_lowest, 
-          ue_db[rnti].phich_info.n_dmrs, tti_tx);
+
+      Info("PHICH: rnti=0x%x, hi=%d, I_lowest=%d, n_dmrs=%d, tti_tx_dl=%d\n",
+          rnti, acks[i].ack,
+          ue_db[rnti].phich_info.n_prb_lowest,
+          ue_db[rnti].phich_info.n_dmrs, tti_tx_dl);
     }
   }
   return SRSLTE_SUCCESS;
@@ -609,14 +610,14 @@ int phch_worker::encode_pdcch_ul(srslte_enb_ul_pusch_t *grants, uint32_t nof_gra
     if (grants[i].needs_pdcch && rnti) {
       if (srslte_enb_dl_put_pdcch_ul(&enb_dl, &grants[i].grant, grants[i].location, rnti, sf_tx)) {
         fprintf(stderr, "Error putting PUSCH %d\n",i);
-        return SRSLTE_ERROR; 
+        return SRSLTE_ERROR;
       }
 
-      Info("PDCCH: UL DCI Format0  rnti=0x%x, cce_index=%d, L=%d, tpc=%d, tti_tx=%d\n",
-           rnti, grants[i].location.ncce, (1<<grants[i].location.L), grants[i].grant.tpc_pusch, tti_tx);
+      Info("PDCCH: UL DCI Format0  rnti=0x%x, cce_index=%d, L=%d, tpc=%d, tti_tx_dl=%d\n",
+           rnti, grants[i].location.ncce, (1<<grants[i].location.L), grants[i].grant.tpc_pusch, tti_tx_dl);
     }
   }
-  return SRSLTE_SUCCESS; 
+  return SRSLTE_SUCCESS;
 }
 
 int phch_worker::encode_pdcch_dl(srslte_enb_dl_pdsch_t *grants, uint32_t nof_grants)
@@ -624,28 +625,28 @@ int phch_worker::encode_pdcch_dl(srslte_enb_dl_pdsch_t *grants, uint32_t nof_gra
   for (uint32_t i=0;i<nof_grants;i++) {
     uint16_t rnti = grants[i].rnti;
     if (rnti) {
-      srslte_dci_format_t format = SRSLTE_DCI_FORMAT1; 
+      srslte_dci_format_t format = SRSLTE_DCI_FORMAT1;
       switch(grants[i].grant.alloc_type) {
         case SRSLTE_RA_ALLOC_TYPE0:
         case SRSLTE_RA_ALLOC_TYPE1:
-          format = SRSLTE_DCI_FORMAT1; 
+          format = SRSLTE_DCI_FORMAT1;
         break;
         case SRSLTE_RA_ALLOC_TYPE2:
-          format = SRSLTE_DCI_FORMAT1A; 
+          format = SRSLTE_DCI_FORMAT1A;
         break;
       }
       if (srslte_enb_dl_put_pdcch_dl(&enb_dl, &grants[i].grant, format, grants[i].location, rnti, sf_tx)) {
         fprintf(stderr, "Error putting PDCCH %d\n",i);
-        return SRSLTE_ERROR; 
-      }      
-      
+        return SRSLTE_ERROR;
+      }
+
       if (LOG_THIS(rnti)) {
-        Info("PDCCH: DL DCI %s rnti=0x%x, cce_index=%d, L=%d, tti_tx=%d\n", srslte_dci_format_string(format), 
-          rnti, grants[i].location.ncce, (1<<grants[i].location.L), tti_tx);
+        Info("PDCCH: DL DCI %s rnti=0x%x, cce_index=%d, L=%d, tti_tx_dl=%d\n", srslte_dci_format_string(format),
+          rnti, grants[i].location.ncce, (1<<grants[i].location.L), tti_tx_dl);
       }
     }
   }
-  return 0; 
+  return 0;
 }
 
 int phch_worker::encode_pdsch(srslte_enb_dl_pdsch_t *grants, uint32_t nof_grants)
@@ -654,14 +655,14 @@ int phch_worker::encode_pdsch(srslte_enb_dl_pdsch_t *grants, uint32_t nof_grants
     uint16_t rnti = grants[i].rnti;
     if (rnti) {
 
-      bool rnti_is_user = true; 
+      bool rnti_is_user = true;
       if (rnti == SRSLTE_SIRNTI || rnti == SRSLTE_PRNTI || rnti == SRSLTE_MRNTI) {
-        rnti_is_user = false; 
+        rnti_is_user = false;
       }
-      
-      srslte_ra_dl_grant_t phy_grant; 
+
+      srslte_ra_dl_grant_t phy_grant;
       srslte_ra_dl_dci_to_grant(&grants[i].grant, enb_dl.cell.nof_prb, rnti, &phy_grant);
-      
+
       char grant_str[64];
       switch(grants[i].grant.alloc_type) {
         case SRSLTE_RA_ALLOC_TYPE0:
@@ -674,19 +675,19 @@ int phch_worker::encode_pdsch(srslte_enb_dl_pdsch_t *grants, uint32_t nof_grants
           sprintf(grant_str, "rb_start=%d",grants[i].grant.type2_alloc.RB_start);
         break;
       }
-      
-      if (LOG_THIS(rnti)) { 
+
+      if (LOG_THIS(rnti)) {
         uint8_t x = 0;
         uint8_t *ptr = grants[i].data;
         uint32_t len = phy_grant.mcs[0].tbs / (uint32_t) 8;
-        if (!ptr) {          
+        if (!ptr) {
           ptr = &x;
-          len = 1; 
-        }        
+          len = 1;
+        }
         log_h->info_hex(ptr, len,
-                             "PDSCH: rnti=0x%x, l_crb=%2d, %s, harq=%d, tbs=%d, mcs=%d, rv=%d, tti_tx=%d\n",
-                             rnti, phy_grant.nof_prb, grant_str, grants[i].grant.harq_process, 
-                             phy_grant.mcs[0].tbs/8, phy_grant.mcs[0].idx, grants[i].grant.rv_idx, tti_tx);
+                             "PDSCH: rnti=0x%x, l_crb=%2d, %s, harq=%d, tbs=%d, mcs=%d, rv=%d, tti_tx_dl=%d\n",
+                             rnti, phy_grant.nof_prb, grant_str, grants[i].grant.harq_process,
+                             phy_grant.mcs[0].tbs/8, phy_grant.mcs[0].idx, grants[i].grant.rv_idx, tti_tx_dl);
       }
 
       srslte_softbuffer_tx_t *sb[SRSLTE_MAX_CODEWORDS] = {grants[i].softbuffer, NULL};
diff --git a/srsenb/src/phy/txrx.cc b/srsenb/src/phy/txrx.cc
index 9427e3459..fa14b0b82 100644
--- a/srsenb/src/phy/txrx.cc
+++ b/srsenb/src/phy/txrx.cc
@@ -115,7 +115,7 @@ void txrx::run_thread()
                     
       /* Compute TX time: Any transmission happens in TTI+4 thus advance 4 ms the reception time */
       srslte_timestamp_copy(&tx_time, &rx_time);
-      srslte_timestamp_add(&tx_time, 0, 4e-3);
+      srslte_timestamp_add(&tx_time, 0, HARQ_DELAY_MS*1e-3);
       
       Debug("Settting TTI=%d, tx_mutex=%d, tx_time=%d:%f to worker %d\n", 
             tti, tx_mutex_cnt, 
diff --git a/srsue/hdr/mac/mux.h b/srsue/hdr/mac/mux.h
index 1167af752..ab081070c 100644
--- a/srsue/hdr/mac/mux.h
+++ b/srsue/hdr/mac/mux.h
@@ -82,8 +82,7 @@ private:
   
   const static int MIN_RLC_SDU_LEN = 0; 
   const static int MAX_NOF_SUBHEADERS = 20; 
-  const static int MAX_HARQ_PROC = 8; 
-  
+
   std::vector<lchid_t> lch; 
   
   // Keep track of the PIDs that transmitted BSR reports 
diff --git a/srsue/src/phy/phch_common.cc b/srsue/src/phy/phch_common.cc
index 489d27197..549783fd9 100644
--- a/srsue/src/phy/phch_common.cc
+++ b/srsue/src/phy/phch_common.cc
@@ -136,12 +136,16 @@ srslte::radio* phch_common::get_radio()
 void phch_common::set_rar_grant(uint32_t tti, uint8_t grant_payload[SRSLTE_RAR_GRANT_LEN])
 {
   srslte_dci_rar_grant_unpack(&rar_grant, grant_payload);
-  rar_grant_pending = true; 
-  // PUSCH is at n+6 or n+7 and phch_worker assumes default delay of 4 ttis
+  rar_grant_pending = true;
+  int delay = MSG3_DELAY_MS-HARQ_DELAY_MS;
+  if (delay < 0) {
+    fprintf(stderr, "Error MSG3_DELAY_MS can't be lower than HARQ_DELAY_MS\n");
+    delay = 0;
+  }
   if (rar_grant.ul_delay) {
-    rar_grant_tti     = (tti + 3) % 10240; 
+    rar_grant_tti     = (tti + delay + 1) % 10240;
   } else {
-    rar_grant_tti     = (tti + 2) % 10240; 
+    rar_grant_tti     = (tti + delay) % 10240;
   }
 }
 
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index e6c0050e2..adeb41603 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -292,6 +292,13 @@ void phch_worker::work_imp()
       }
     }
   }
+
+  // Process RAR before UL to enable zero-delay Msg3
+  bool rar_delivered = false;
+  if (HARQ_DELAY_MS == MSG3_DELAY_MS && dl_mac_grant.rnti_type == SRSLTE_RNTI_RAR) {
+    rar_delivered = true;
+    phy->mac->tb_decoded(dl_ack[0], 0, dl_mac_grant.rnti_type, dl_mac_grant.pid);
+  }
   
   // Decode PHICH 
   bool ul_ack = false;
@@ -313,8 +320,8 @@ void phch_worker::work_imp()
     set_uci_periodic_cqi();
   }
 
-  /* TTI offset for UL is always 4 for LTE */
-  ul_action.tti_offset = 4;
+  /* TTI offset for UL */
+  ul_action.tti_offset = HARQ_DELAY_MS;
 
   /* Send UL grant or HARQ information (from PHICH) to MAC */
   if (ul_grant_available         && ul_ack_available)  {    
@@ -335,7 +342,7 @@ void phch_worker::work_imp()
                  &ul_action.softbuffers[0], ul_action.rv[0], ul_action.rnti, ul_mac_grant.is_from_rar);
     signal_ready = true; 
     if (ul_action.expect_ack) {
-      phy->set_pending_ack(HARQ_RX(tti), ue_ul.pusch_cfg.grant.n_prb_tilde[0], ul_action.phy_grant.ul.ncs_dmrs);
+      phy->set_pending_ack(TTI_RX_ACK(tti), ue_ul.pusch_cfg.grant.n_prb_tilde[0], ul_action.phy_grant.ul.ncs_dmrs);
     }
 
   } else if (dl_action.generate_ack || uci_data.scheduling_request || uci_data.uci_cqi_len > 0) {
@@ -357,7 +364,7 @@ void phch_worker::work_imp()
   if (!dl_action.generate_ack_callback) {
     if (dl_mac_grant.rnti_type == SRSLTE_RNTI_PCH && dl_action.decode_enabled[0]) {
       phy->mac->pch_decoded_ok(dl_mac_grant.n_bytes[0]);
-    } else {
+    } else if (!rar_delivered) {
       for (uint32_t tb = 0; tb < SRSLTE_MAX_TB; tb++) {
         if (dl_action.decode_enabled[tb]) {
           phy->mac->tb_decoded(dl_ack[tb], tb, dl_mac_grant.rnti_type, dl_mac_grant.pid);
@@ -475,7 +482,7 @@ bool phch_worker::decode_pdcch_dl(srsue::mac_interface_phy::mac_grant_t* grant)
     /* Fill MAC grant structure */
     grant->ndi[0] = dci_unpacked.ndi;
     grant->ndi[1] = dci_unpacked.ndi_1;
-    grant->pid = dci_unpacked.harq_process;
+    grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
     grant->n_bytes[0] = grant->phy_grant.dl.mcs[0].tbs / (uint32_t) 8;
     grant->n_bytes[1] = grant->phy_grant.dl.mcs[1].tbs / (uint32_t) 8;
     grant->tti = tti;
@@ -663,7 +670,7 @@ bool phch_worker::decode_pdcch_ul(mac_interface_phy::mac_grant_t* grant)
   char timestr[64];
   timestr[0]='\0';
 
-  phy->reset_pending_ack(HARQ_RX(tti));
+  phy->reset_pending_ack(TTI_RX_ACK(tti));
 
   srslte_dci_msg_t dci_msg; 
   srslte_ra_ul_dci_t dci_unpacked;
@@ -776,7 +783,7 @@ void phch_worker::set_uci_sr()
 {
   uci_data.scheduling_request = false; 
   if (phy->sr_enabled) {
-    uint32_t sr_tx_tti = HARQ_TX(tti);
+    uint32_t sr_tx_tti = TTI_TX(tti);
     // Get I_sr parameter   
     if (srslte_ue_ul_sr_send_tti(I_sr, sr_tx_tti)) {
       Info("PUCCH: SR transmission at TTI=%d, I_sr=%d\n", sr_tx_tti, I_sr);
@@ -793,7 +800,7 @@ void phch_worker::set_uci_periodic_cqi()
   int cqi_max       = phy->args->cqi_max;
   
   if (period_cqi.configured && rnti_is_set) {
-    if (period_cqi.ri_idx_present && srslte_ri_send(period_cqi.pmi_idx, period_cqi.ri_idx, HARQ_TX(tti))) {
+    if (period_cqi.ri_idx_present && srslte_ri_send(period_cqi.pmi_idx, period_cqi.ri_idx, TTI_TX(tti))) {
       if (uci_data.uci_ri_len) {
         uci_data.uci_cqi[0] = uci_data.uci_ri;
         uci_data.uci_cqi_len = uci_data.uci_ri_len;
@@ -802,7 +809,7 @@ void phch_worker::set_uci_periodic_cqi()
         uci_data.uci_pmi_len = 0;
         Info("PUCCH: Periodic RI=%d\n", uci_data.uci_cqi[0]);
       }
-    } else if (srslte_cqi_send(period_cqi.pmi_idx, HARQ_TX(tti))) {
+    } else if (srslte_cqi_send(period_cqi.pmi_idx, TTI_TX(tti))) {
       srslte_cqi_value_t cqi_report;
       if (period_cqi.format_is_subband) {
         // TODO: Implement subband periodic reports
@@ -868,8 +875,8 @@ void phch_worker::set_uci_aperiodic_cqi()
 
 bool phch_worker::srs_is_ready_to_send() {
   if (srs_cfg.configured) {
-    if (srslte_refsignal_srs_send_cs(srs_cfg.subframe_config, HARQ_RX(tti)%10) == 1 &&
-        srslte_refsignal_srs_send_ue(srs_cfg.I_srs, HARQ_TX(tti))        == 1)
+    if (srslte_refsignal_srs_send_cs(srs_cfg.subframe_config, TTI_TX(tti)%10) == 1 &&
+        srslte_refsignal_srs_send_ue(srs_cfg.I_srs, TTI_TX(tti))              == 1)
     {
       return true; 
     }
@@ -889,7 +896,7 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
   char timestr[64];
   timestr[0]='\0';
   
-  if (srslte_ue_ul_cfg_grant(&ue_ul, grant, HARQ_TX(tti), rv, current_tx_nb)) {
+  if (srslte_ue_ul_cfg_grant(&ue_ul, grant, TTI_TX(tti), rv, current_tx_nb)) {
     Error("Configuring UL grant\n");
   }
   
@@ -919,7 +926,7 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
 #endif
 
   Info("PUSCH: tti_tx=%d, n_prb=%d, rb_start=%d, tbs=%d, mod=%d, mcs=%d, rv_idx=%d, ack=%s, ri=%s, cfo=%.1f Hz%s\n",
-         HARQ_TX(tti),
+         TTI_TX(tti),
          grant->L_prb, grant->n_prb[0], 
          grant->mcs.tbs/8, grant->mcs.mod, grant->mcs.idx, rv,
          uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
@@ -950,7 +957,7 @@ void phch_worker::encode_pucch()
     gettimeofday(&t[1], NULL);
 #endif
 
-    if (srslte_ue_ul_pucch_encode(&ue_ul, uci_data, last_dl_pdcch_ncce, HARQ_TX(tti), signal_buffer[0])) {
+    if (srslte_ue_ul_pucch_encode(&ue_ul, uci_data, last_dl_pdcch_ncce, TTI_TX(tti), signal_buffer[0])) {
       Error("Encoding PUCCH\n");
     }
 
@@ -966,7 +973,7 @@ void phch_worker::encode_pucch()
   float gain = set_power(tx_power);  
   
   Info("PUCCH: tti_tx=%d, n_cce=%3d, n_pucch=%d, n_prb=%d, ack=%s%s, ri=%s, pmi=%s%s, sr=%s, cfo=%.1f Hz%s\n",
-         HARQ_TX(tti),
+         TTI_TX(tti),
          last_dl_pdcch_ncce, ue_ul.pucch.last_n_pucch, ue_ul.pucch.last_n_prb, 
        uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
        uci_data.uci_ack_len>1?(uci_data.uci_ack_2?"1":"0"):"",
@@ -987,7 +994,7 @@ void phch_worker::encode_srs()
   char timestr[64];
   timestr[0]='\0';
   
-  if (srslte_ue_ul_srs_encode(&ue_ul, HARQ_TX(tti), signal_buffer[0]))
+  if (srslte_ue_ul_srs_encode(&ue_ul, TTI_TX(tti), signal_buffer[0]))
   {
     Error("Encoding SRS\n");
   }
@@ -1002,7 +1009,7 @@ void phch_worker::encode_srs()
   float gain = set_power(tx_power);
   uint32_t fi = srslte_vec_max_fi((float*) signal_buffer, SRSLTE_SF_LEN_PRB(cell.nof_prb));
   float *f = (float*) signal_buffer;
-  Info("SRS:   power=%.2f dBm, tti_tx=%d%s\n", tx_power, HARQ_TX(tti), timestr);
+  Info("SRS:   power=%.2f dBm, tti_tx=%d%s\n", tx_power, TTI_TX(tti), timestr);
   
 }
 

From 5208c4c1602b63b814aee04b058f7e2119cf7d8f Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Sun, 8 Oct 2017 00:41:39 +0200
Subject: [PATCH 27/55] Removed unused code

---
 srsenb/src/phy/phch_common.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/srsenb/src/phy/phch_common.cc b/srsenb/src/phy/phch_common.cc
index c4b94153c..061743274 100644
--- a/srsenb/src/phy/phch_common.cc
+++ b/srsenb/src/phy/phch_common.cc
@@ -128,10 +128,6 @@ bool phch_common::ack_is_pending(uint32_t sf_idx, uint16_t rnti, uint32_t *last_
     bool ret = pending_ack[rnti].is_pending[sf_idx];  
     pending_ack[rnti].is_pending[sf_idx] = false;
 
-    if (ret) {
-
-    }
-    
     if (ret && last_n_pdcch) {
       *last_n_pdcch = pending_ack[rnti].n_pdcch[sf_idx];
     }

From 1a5cf45ddacb16271bf70df002ff7ad881a84690 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Mon, 9 Oct 2017 16:30:32 +0200
Subject: [PATCH 28/55] Solved compilation error for SSE (Tested in Atom)

---
 lib/include/srslte/phy/utils/simd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 2590794f2..08eed115f 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -738,7 +738,7 @@ typedef __m256 simd_sel_t;
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
 typedef __m128i simd_i_t;
-typedef __m128i simd_sel_t;
+typedef __m128 simd_sel_t;
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -807,7 +807,7 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
   return _mm256_cmp_ps(a, b, _CMP_GT_OS);
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
-  return  (simd_i_t) _mm_cmpgt_ps(a, b);
+  return  (simd_sel_t) _mm_cmpgt_ps(a, b);
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */

From a180b5ebacfdfa8f90add5f764608ee57be3a8fd Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Tue, 10 Oct 2017 12:06:24 +0200
Subject: [PATCH 29/55] Msg3 delay is added to harq delay

---
 lib/include/srslte/common/common.h |  2 +-
 srsenb/src/mac/scheduler.cc        |  2 +-
 srsue/src/phy/phch_common.cc       | 10 ++++------
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/include/srslte/common/common.h b/lib/include/srslte/common/common.h
index fd785fbca..6372af73e 100644
--- a/lib/include/srslte/common/common.h
+++ b/lib/include/srslte/common/common.h
@@ -45,7 +45,7 @@
 #define SRSLTE_N_RADIO_BEARERS 11
 
 #define HARQ_DELAY_MS   4
-#define MSG3_DELAY_MS   6
+#define MSG3_DELAY_MS   2 // Delay added to HARQ_DELAY_MS
 #define TTI_TX(tti)     ((tti+HARQ_DELAY_MS)%10240)
 #define TTI_RX_ACK(tti) ((tti+(2*HARQ_DELAY_MS))%10240)
 
diff --git a/srsenb/src/mac/scheduler.cc b/srsenb/src/mac/scheduler.cc
index 21d60652d..a7e3d12d6 100644
--- a/srsenb/src/mac/scheduler.cc
+++ b/srsenb/src/mac/scheduler.cc
@@ -541,7 +541,7 @@ int sched::dl_sched_rar(dl_sched_rar_t rar[MAX_RAR_LIST])
                 pending_rar[j].rar_tti = 0;            
                 
                 // Save UL resources 
-                uint32_t pending_tti=(current_tti+MSG3_DELAY_MS)%10;
+                uint32_t pending_tti=(current_tti+MSG3_DELAY_MS+HARQ_DELAY_MS)%10;
                 pending_msg3[pending_tti].enabled = true; 
                 pending_msg3[pending_tti].rnti    = pending_rar[j].rnti; 
                 pending_msg3[pending_tti].L       = L_prb; 
diff --git a/srsue/src/phy/phch_common.cc b/srsue/src/phy/phch_common.cc
index 549783fd9..7d2948c1a 100644
--- a/srsue/src/phy/phch_common.cc
+++ b/srsue/src/phy/phch_common.cc
@@ -137,15 +137,13 @@ void phch_common::set_rar_grant(uint32_t tti, uint8_t grant_payload[SRSLTE_RAR_G
 {
   srslte_dci_rar_grant_unpack(&rar_grant, grant_payload);
   rar_grant_pending = true;
-  int delay = MSG3_DELAY_MS-HARQ_DELAY_MS;
-  if (delay < 0) {
-    fprintf(stderr, "Error MSG3_DELAY_MS can't be lower than HARQ_DELAY_MS\n");
-    delay = 0;
+  if (MSG3_DELAY_MS < 0) {
+    fprintf(stderr, "Error MSG3_DELAY_MS can't be negative\n");
   }
   if (rar_grant.ul_delay) {
-    rar_grant_tti     = (tti + delay + 1) % 10240;
+    rar_grant_tti     = (tti + MSG3_DELAY_MS + 1) % 10240;
   } else {
-    rar_grant_tti     = (tti + delay) % 10240;
+    rar_grant_tti     = (tti + MSG3_DELAY_MS) % 10240;
   }
 }
 

From fda886407b7bebe2537ee608687f74f5dae9cf81 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Tue, 10 Oct 2017 12:33:10 +0200
Subject: [PATCH 30/55] Added option to force the DL/UL frequency at the UE

---
 srsenb/enb.conf.example    |  4 +++-
 srsue/hdr/phy/phch_recv.h  |  5 +++++
 srsue/hdr/phy/phy.h        |  3 ++-
 srsue/src/main.cc          |  2 ++
 srsue/src/phy/phch_recv.cc | 29 ++++++++++++++++++++++-------
 srsue/src/phy/phy.cc       |  5 +++++
 srsue/src/ue.cc            |  4 ++++
 srsue/ue.conf.example      |  2 ++
 8 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/srsenb/enb.conf.example b/srsenb/enb.conf.example
index 44e546116..7f023d0c6 100644
--- a/srsenb/enb.conf.example
+++ b/srsenb/enb.conf.example
@@ -45,7 +45,9 @@ drb_config = drb.conf
 # tx_gain: Transmit gain (dB). 
 # rx_gain: Optional receive gain (dB). If disabled, AGC if enabled
 #
-# Optional parameters: 
+# Optional parameters:
+# dl_freq:            Override DL frequency corresponding to dl_earfcn
+# ul_freq:            Override UL frequency corresponding to dl_earfcn (must be set if dl_freq is set)
 # device_name:        Device driver family. Supported options: "auto" (uses first found), "UHD" or "bladeRF" 
 # device_args:        Arguments for the device driver. Options are "auto" or any string. 
 #                     Default for UHD: "recv_frame_size=9232,send_frame_size=9232"
diff --git a/srsue/hdr/phy/phch_recv.h b/srsue/hdr/phy/phch_recv.h
index 044960760..c51622e53 100644
--- a/srsue/hdr/phy/phch_recv.h
+++ b/srsue/hdr/phy/phch_recv.h
@@ -53,6 +53,7 @@ public:
   void set_agc_enable(bool enable);
 
   void    set_earfcn(std::vector<uint32_t> earfcn);
+  void    force_freq(float dl_freq, float ul_freq);
 
   void    reset_sync();
   void    cell_search_start();
@@ -171,6 +172,10 @@ private:
   int    cell_meas_rsrp();
   int    cell_search(int force_N_id_2 = -1);
   bool   set_cell();
+
+  float dl_freq;
+  float ul_freq;
+
 };
 
 } // namespace srsue
diff --git a/srsue/hdr/phy/phy.h b/srsue/hdr/phy/phy.h
index 0c77360b2..07a2713fa 100644
--- a/srsue/hdr/phy/phy.h
+++ b/srsue/hdr/phy/phy.h
@@ -76,6 +76,7 @@ public:
   void write_trace(std::string filename);
 
   void set_earfcn(std::vector<uint32_t> earfcns);
+  void force_freq(float dl_freq, float ul_freq);
 
   /********** RRC INTERFACE ********************/
   void    reset();
@@ -167,7 +168,7 @@ private:
   
   /* Current time advance */
   uint32_t     n_ta;
-    
+
   bool init_(srslte::radio *radio_handler, mac_interface_phy *mac, srslte::log *log_h, bool do_agc, uint32_t nof_workers);
   void set_default_args(phy_args_t *args);
   bool check_args(phy_args_t *args); 
diff --git a/srsue/src/main.cc b/srsue/src/main.cc
index 569083998..47d5f807e 100644
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@@ -65,6 +65,8 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
   common.add_options()
     ("rf.dl_earfcn", bpo::value<uint32_t>(&args->rf.dl_earfcn)->default_value(3400), "Downlink EARFCN")
     ("rf.freq_offset", bpo::value<float>(&args->rf.freq_offset)->default_value(0), "(optional) Frequency offset")
+    ("rf.dl_freq",     bpo::value<float>(&args->rf.dl_freq)->default_value(-1),      "Downlink Frequency (if positive overrides EARFCN)")
+    ("rf.ul_freq",     bpo::value<float>(&args->rf.ul_freq)->default_value(-1),      "Uplink Frequency (if positive overrides EARFCN)")
     ("rf.rx_gain", bpo::value<float>(&args->rf.rx_gain)->default_value(-1), "Front-end receiver gain")
     ("rf.tx_gain", bpo::value<float>(&args->rf.tx_gain)->default_value(-1), "Front-end transmitter gain")
     ("rf.nof_rx_ant", bpo::value<uint32_t>(&args->rf.nof_rx_ant)->default_value(1), "Number of RX antennas")
diff --git a/srsue/src/phy/phch_recv.cc b/srsue/src/phy/phch_recv.cc
index e10c013fe..8642460aa 100644
--- a/srsue/src/phy/phch_recv.cc
+++ b/srsue/src/phy/phch_recv.cc
@@ -63,6 +63,8 @@ double callback_set_rx_gain(void *h, double gain) {
 
 
 phch_recv::phch_recv() {
+  dl_freq = -1;  // not positive: no forced DL frequency, so set_frequency() derives it from the EARFCN
+  ul_freq = -1;  // not positive: no forced UL frequency, so set_frequency() derives it from the EARFCN
   bzero(&cell, sizeof(srslte_cell_t));
   running = false;
 }
@@ -445,6 +447,11 @@ void phch_recv::set_earfcn(std::vector<uint32_t> earfcn) {
   this->earfcn = earfcn;
 }
 
+void phch_recv::force_freq(float dl_freq, float ul_freq) {  // Stores forced DL/UL carrier frequencies in Hz; set_frequency() uses them instead of the EARFCN only when both are > 0
+  this->dl_freq = dl_freq;  // NOTE(review): assuming float is IEEE-754 binary32, GHz-range values in Hz are not stored exactly - consider double
+  this->ul_freq = ul_freq;
+}
+
 bool phch_recv::stop_sync() {
 
   wait_radio_reset();
@@ -568,17 +575,25 @@ bool phch_recv::cell_select(uint32_t earfcn, srslte_cell_t cell) {
 
 bool phch_recv::set_frequency()
 {
-  double dl_freq = 1e6*srslte_band_fd(current_earfcn);
-  double ul_freq = 1e6*srslte_band_fu(srslte_band_ul_earfcn(current_earfcn));
-  if (dl_freq > 0 && ul_freq > 0) {
+  double set_dl_freq = 0;
+  double set_ul_freq = 0;
+
+  if (this->dl_freq > 0 && this->ul_freq > 0) {
+    set_dl_freq = this->dl_freq;
+    set_ul_freq = this->ul_freq;
+  } else {
+    set_dl_freq = 1e6*srslte_band_fd(current_earfcn);
+    set_ul_freq = 1e6*srslte_band_fu(srslte_band_ul_earfcn(current_earfcn));
+  }
+  if (set_dl_freq > 0 && set_ul_freq > 0) {
     log_h->info("SYNC:  Set DL EARFCN=%d, f_dl=%.1f MHz, f_ul=%.1f MHz\n",
-                current_earfcn, dl_freq / 1e6, ul_freq / 1e6);
+                current_earfcn, set_dl_freq / 1e6, set_ul_freq / 1e6);
 
     log_h->console("Searching cell in DL EARFCN=%d, f_dl=%.1f MHz, f_ul=%.1f MHz\n",
-                current_earfcn, dl_freq / 1e6, ul_freq / 1e6);
+                current_earfcn, set_dl_freq / 1e6, set_ul_freq / 1e6);
 
-    radio_h->set_rx_freq(dl_freq);
-    radio_h->set_tx_freq(ul_freq);
+    radio_h->set_rx_freq(set_dl_freq);
+    radio_h->set_tx_freq(set_ul_freq);
     ul_dl_factor = radio_h->get_tx_freq()/radio_h->get_rx_freq();
 
     srslte_ue_sync_reset(&ue_sync);
diff --git a/srsue/src/phy/phy.cc b/srsue/src/phy/phy.cc
index e74107661..df29726d5 100644
--- a/srsue/src/phy/phy.cc
+++ b/srsue/src/phy/phy.cc
@@ -332,6 +332,11 @@ void phy::set_earfcn(vector< uint32_t > earfcns)
   sf_recv.set_earfcn(earfcns);
 }
 
+void phy::force_freq(float dl_freq, float ul_freq)  // Passes the forced DL/UL frequencies unchanged to sf_recv.force_freq()
+{
+  sf_recv.force_freq(dl_freq, ul_freq);
+}
+
 bool phy::sync_status()
 {
   return sf_recv.status_is_sync();
diff --git a/srsue/src/ue.cc b/srsue/src/ue.cc
index 92adaf8dd..3560a78fe 100644
--- a/srsue/src/ue.cc
+++ b/srsue/src/ue.cc
@@ -193,6 +193,10 @@ bool ue::init(all_args_t *args_)
   earfcn_list.push_back(args->rf.dl_earfcn);
   phy.set_earfcn(earfcn_list);
 
+  if (args->rf.dl_freq > 0 && args->rf.ul_freq > 0) {
+    phy.force_freq(args->rf.dl_freq, args->rf.ul_freq);
+  }
+
   printf("Waiting PHY to initialize...\n");
   phy.wait_initialize();
   phy.configure_ul_params();
diff --git a/srsue/ue.conf.example b/srsue/ue.conf.example
index 30c05069a..5d14d3c3c 100644
--- a/srsue/ue.conf.example
+++ b/srsue/ue.conf.example
@@ -9,6 +9,8 @@
 # rx_gain: Optional receive gain (dB). If disabled, AGC if enabled
 #
 # Optional parameters: 
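+# dl_freq:            Override DL frequency (in Hz) corresponding to dl_earfcn. Only applied if ul_freq is also set and both are positive
+# ul_freq:            Override UL frequency (in Hz) corresponding to dl_earfcn. Only applied if dl_freq is also set and both are positive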
 # nof_rx_ant:         Number of RX antennas (Default 1, supported 1 or 2)
 # device_name:        Device driver family. Supported options: "auto" (uses first found), "UHD" or "bladeRF" 
 # device_args:        Arguments for the device driver. Options are "auto" or any string. 

From 2019ca31eff3b7190cb6b2f3cf53734f95be9486 Mon Sep 17 00:00:00 2001
From: yagoda <tallonj@tcd.ie>
Date: Fri, 13 Oct 2017 15:35:48 +0100
Subject: [PATCH 31/55] adding neon support for new kernel structure

---
 lib/include/srslte/phy/utils/simd.h | 319 +++++++++++++++++++++++++---
 1 file changed, 292 insertions(+), 27 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 08eed115f..6e4185788 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -121,7 +121,17 @@
 #define SRSLTE_SIMD_C16_SIZE  8
 
 #else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
 
+#define SRSLTE_SIMD_F_SIZE    4
+#define SRSLTE_SIMD_CF_SIZE   4
+
+#define SRSLTE_SIMD_I_SIZE    4
+
+#define SRSLTE_SIMD_S_SIZE    8
+#define SRSLTE_SIMD_C16_SIZE  8
+
+#else /* LV_HAVE_NEON */
 #define SRSLTE_SIMD_F_SIZE    0
 #define SRSLTE_SIMD_CF_SIZE   0
 
@@ -147,6 +157,10 @@ typedef __m256 simd_f_t;
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
 typedef __m128 simd_f_t;
+#else /* HAVE_NEON */
+#ifdef HAVE_NEON
+typedef float32x4 simd_f_t;
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -161,6 +175,10 @@ static inline simd_f_t srslte_simd_f_load(float *ptr) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_load_ps(ptr);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vld1q_f32(ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -175,12 +193,16 @@ static inline simd_f_t srslte_simd_f_loadu(float *ptr) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_loadu_ps(ptr);
+#else /* LV_HAVE_SSE */
+  #ifdef HAVE_NEON
+  return vld1q_f32(ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
 }
 
-static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {
+static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {vst1q_f32
 #ifdef LV_HAVE_AVX512
   _mm512_store_ps(ptr, simdreg);
 #else /* LV_HAVE_AVX512 */
@@ -189,6 +211,10 @@ static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   _mm_store_ps(ptr, simdreg);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  vst1q_f32(ptr, simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -203,6 +229,10 @@ static inline void srslte_simd_f_storeu(float *ptr, simd_f_t simdreg) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   _mm_storeu_ps(ptr, simdreg);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  vst1q_f32(ptr, simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -217,6 +247,10 @@ static inline simd_f_t srslte_simd_f_set1(float x) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_set1_ps(x);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vdupq_n_f32(x);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -231,6 +265,10 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_mul_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vmulq_f32(a,b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -245,6 +283,10 @@ static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return _mm_rcp_ps(a);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vrecpeq_f32(a);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -265,6 +307,9 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
+
+
+
 static inline simd_f_t srslte_simd_f_sub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
   return _mm512_sub_ps(a, b);
@@ -274,6 +319,10 @@ static inline simd_f_t srslte_simd_f_sub(simd_f_t a, simd_f_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_sub_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vsubq_f32(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -286,8 +335,12 @@ static inline simd_f_t srslte_simd_f_add(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX2
   return _mm256_add_ps(a, b);
 #else /* LV_HAVE_AVX2 */
-#ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_SSE 
   return _mm_add_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vaddq_f32(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -300,8 +353,12 @@ static inline simd_f_t srslte_simd_f_zero (void) {
 #ifdef LV_HAVE_AVX2
  return _mm256_setzero_ps();
 #else /* LV_HAVE_AVX2 */
-#ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_SSE 
  return _mm_setzero_ps();
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vdupq_n_f32(0);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -344,6 +401,10 @@ static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) {
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return _mm_hadd_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) );
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -358,6 +419,10 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return _mm_sqrt_ps(a);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vrecpeq_f32(vrsqrteq_f32(a));
+#endif /* HAVE_NEON */  
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -368,10 +433,15 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
 
 #if SRSLTE_SIMD_CF_SIZE
 
+#ifdef HAVE_NEON
+   typedef float32x4x2_t simd_cf_t;
+#else
 typedef struct {
   simd_f_t re;
   simd_f_t im;
+  
 } simd_cf_t;
+#endif
 
 /* Complex Single precission Floating point functions */
 static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) {
@@ -399,6 +469,10 @@ static inline simd_cf_t srslte_simd_cfi_load(cf_t *ptr) {
   __m128 i2 = _mm_load_ps((float*)(ptr + 2));
   ret.re = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(2,0,2,0));
   ret.im = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(3,1,3,1));
+#else
+#ifdef HAVE_NEON
+  ret = vld2q_f32((float*)(ptr));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -431,6 +505,10 @@ static inline simd_cf_t srslte_simd_cfi_loadu(cf_t *ptr) {
   __m128 i2 = _mm_loadu_ps((float*)(ptr + 2));
   ret.re = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(2,0,2,0));
   ret.im = _mm_shuffle_ps(i1, i2, _MM_SHUFFLE(3,1,3,1));
+#else
+#ifdef HAVE_NEON
+  ret = vld2q_f32((float*)(ptr));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -450,6 +528,11 @@ static inline simd_cf_t srslte_simd_cf_load(float *re, float *im) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_load_ps(re);
   ret.im = _mm_load_ps(im);
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  ret.val[0] = vld1q_f32(ptr);
+  ret.val[1] = vld1q_f32(ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -469,6 +552,11 @@ static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_loadu_ps(re);
   ret.im = _mm_loadu_ps(im);
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  ret.val[0] = vld1q_f32(ptr);
+  ret.val[1] = vld1q_f32(ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -497,6 +585,10 @@ static inline void srslte_simd_cfi_store(cf_t *ptr, simd_cf_t simdreg) {
 #ifdef LV_HAVE_SSE
   _mm_store_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im));
   _mm_store_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im));
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst2q_f32((float*)(ptr), simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -524,6 +616,10 @@ static inline void srslte_simd_cfi_storeu(cf_t *ptr, simd_cf_t simdreg) {
 #ifdef LV_HAVE_SSE
   _mm_storeu_ps((float*)(ptr), _mm_unpacklo_ps(simdreg.re, simdreg.im));
   _mm_storeu_ps((float*)(ptr + 2), _mm_unpackhi_ps(simdreg.re, simdreg.im));
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst2q_f32((float*)(ptr), simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -541,6 +637,11 @@ static inline void srslte_simd_cf_store(float *re, float *im, simd_cf_t simdreg)
 #ifdef LV_HAVE_SSE
   _mm_store_ps((float *) re, simdreg.re);
   _mm_store_ps((float *) im, simdreg.im);
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst1q_f32((float *) re, simdreg.val[0]);
+  vst1q_f32((float *) im, simdreg.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -558,6 +659,11 @@ static inline void srslte_simd_cf_storeu(float *re, float *im, simd_cf_t simdreg
 #ifdef LV_HAVE_SSE
   _mm_storeu_ps((float *) re, simdreg.re);
   _mm_storeu_ps((float *) im, simdreg.im);
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst1q_f32((float *) re, simdreg.val[0]);
+  vst1q_f32((float *) im, simdreg.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -576,6 +682,11 @@ static inline simd_cf_t srslte_simd_cf_set1 (cf_t x) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_set1_ps(__real__ x);
   ret.im = _mm_set1_ps(__imag__ x);
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  re.val[0] = vdupq_n_f32(__real__ x);
+  im.val[1] = vdupq_n_f32(__imag__ x);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -601,6 +712,13 @@ static inline simd_cf_t srslte_simd_cf_prod (simd_cf_t a, simd_cf_t b) {
                       _mm_mul_ps(a.im, b.im));
   ret.im = _mm_add_ps(_mm_mul_ps(a.re, b.im),
                       _mm_mul_ps(a.im, b.re));
+#else
+#ifdef HAVE_NEON
+  ret.val[0] = vsubq_f32(vmulq_f32(a.val[0],b.val[0]),
+                         vmulq_f32(a.val[1],b.val[1]));
+  ret.val[1] = vaddq_f32(vmulq_f32(a.val[0],b.val[1]),
+                         vmulq_f32(a.val[1],b.val[0]));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -626,6 +744,13 @@ static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) {
                       _mm_mul_ps(a.im, b.im));
   ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re),
                       _mm_mul_ps(a.re, b.im));
+  #else
+#ifdef HAVE_NEON
+  ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]),
+                         vmulq_f32(a.val[1],b.val[1]));
+  ret.val[1] = vsubq_f32(vmulq_f32(a.val[1],b.val[0]),
+                         vmulq_f32(a.val[0],b.val[1]));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -645,6 +770,11 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_add_ps(a.re, b.re);
   ret.im = _mm_add_ps(a.im, b.im);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  ret.val[0] = vaddq_f32(a.val[0],a.val[0]);
+  ret.val[1] = vaddq_f32(a.val[1],a.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -665,6 +795,11 @@ static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_mul_ps(a.re, b);
   ret.im = _mm_mul_ps(a.im, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  ret.val[0] = vmulq_f32(a.val[0],b);
+  ret.val[1] = vmulq_f32(a.val[1],b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -699,6 +834,16 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
   simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
   ret.re = _mm_mul_ps(a.re, rcp);
   ret.im = _mm_mul_ps(neg_a_im, rcp);
+ #else /* LV_HAVE_SSE */
+  #ifdef HAVE_NEON
+  simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]);
+  simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]);
+  simd_f_t mod2 = vaddq_f32(a2re, a2im);
+  simd_f_t rcp = vrecpeq_f32(mod2);
+  simd_f_t neg_a_im = vnegq_f32(vdupq_n_f32(-0.0f), a.val[1]);
+  ret.val[0] = vmulq_f32(a.val[0], rcp);
+  ret.val[1] = vmulq_f32(neg_a_im, rcp);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -718,7 +863,11 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
 #ifdef LV_HAVE_SSE
   ret.re = _mm_setzero_ps();
   ret.im = _mm_setzero_ps();
-#endif /* LV_HAVE_SSE */
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  ret.val[0] = vdupq_n_f32(0);
+  ret.val[1] = vdupq_n_f32(0);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
   return ret;
@@ -739,6 +888,11 @@ typedef __m256 simd_sel_t;
 #ifdef LV_HAVE_SSE
 typedef __m128i simd_i_t;
 typedef __m128 simd_sel_t;
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+typedef int32x4_t simd_i_t;
+typedef __m128 simd_sel_t;
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -752,6 +906,10 @@ static inline simd_i_t srslte_simd_i_load(int *x) {
 #else
   #ifdef LV_HAVE_SSE
   return _mm_load_si128((__m128i*)x);
+#else
+  #ifdef HAVE_NEON
+  return vld1_s32((int32x4_t*)x);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -780,6 +938,10 @@ static inline simd_i_t srslte_simd_i_set1(int x) {
 #else
   #ifdef LV_HAVE_SSE
   return _mm_set1_epi32(x);
+#else
+  #ifdef HAVE_NEON
+  return vdupq_n_s32(x);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -794,10 +956,14 @@ static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
 #else
 #ifdef LV_HAVE_SSE
   return _mm_add_epi32(a, b);
+#else
+#ifdef HAVE_NEON
+  return vaddq_s32(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
-}
+}vcgtq_f32
 
 static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
@@ -808,6 +974,10 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return  (simd_sel_t) _mm_cmpgt_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return  (simd_sel_t) vcgtq_f32(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -841,6 +1011,10 @@ typedef __m256i simd_s_t;
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
 typedef __m128i simd_s_t;
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+typedef int16x8_t simd_s_t;
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -854,6 +1028,10 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_load_si128((__m128i*) ptr);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vld1q_s16((int16x8_t*) ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -867,7 +1045,11 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
   return _mm256_loadu_si256((__m256i*) ptr);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  return _mm_loadu_si128((__m128i*) ptr);
+  return _mm_loadu_si128((__m128i*) ptr)
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vld1q_s16((int16x8_t*) ptr);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -882,6 +1064,10 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   _mm_store_si128((__m128i*) ptr, simdreg);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  vst1q_s16((int16x8_t*) ptr, simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -896,11 +1082,15 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   _mm_storeu_si128((__m128i*) ptr, simdreg);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  vst1q_s16((int16x8_t*) ptr, simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
 }
-
+vdupq_n_s16
 static inline simd_s_t srslte_simd_s_zero(void) {
 #ifdef LV_HAVE_AVX512
   return _mm512_setzero_si512();
@@ -910,10 +1100,14 @@ static inline simd_s_t srslte_simd_s_zero(void) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_setzero_si128();
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vdupq_n_s16(0);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
-}
+}vmulq_s16
 
 static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) {
 #ifdef LV_HAVE_AVX512
@@ -924,6 +1118,10 @@ static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_mullo_epi16(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vmulq_s16(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -938,6 +1136,10 @@ static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_add_epi16(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vaddq_s16(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -952,6 +1154,10 @@ static inline simd_s_t srslte_simd_s_sub(simd_s_t a, simd_s_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_sub_epi16(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vsubq_s16(a, b);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -962,8 +1168,9 @@ static inline simd_s_t srslte_simd_s_sub(simd_s_t a, simd_s_t b) {
 
 #if SRSLTE_SIMD_C16_SIZE
 
-typedef struct {
+typedef 
 #ifdef LV_HAVE_AVX512
+  struct {
     union {
         __m512i m512;
         int16_t i16[32];
@@ -974,24 +1181,32 @@ typedef struct {
     } im;
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX2
-  union {
-    __m256i m256;
-    int16_t i16[16];
-  } re;
-  union {
-    __m256i m256;
-    int16_t i16[16];
-  } im;
+ struct {
+    union {
+      __m256i m256;
+      int16_t i16[16];
+    } re;
+    union {
+      __m256i m256;
+      int16_t i16[16];
+    } im;
 #else
 #ifdef LV_HAVE_SSE
-  union {
-    __m128i m128;
-    int16_t i16[8];
-  } re;
-  union {
-    __m128i m128;
-    int16_t i16[8];
-  } im;
+ struct {
+    union {
+      __m128i m128;
+      int16_t i16[8];
+    } re;
+    union {
+      __m128i m128;
+      int16_t i16[8];
+    } im;
+#else
+#ifdef HAVE_NEON
+ union {
+     int16x8x2_t m128;
+     int16_t i16[16];
+#endif /* HAVE_NEON */  
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -1017,6 +1232,10 @@ static inline simd_c16_t srslte_simd_c16i_load(c16_t *ptr) {
   __m128i in2 = _mm_load_si128((__m128i*)(ptr + 8));
   ret.re.m128 = _mm_blend_epi16(in1,_mm_shufflelo_epi16(_mm_shufflehi_epi16(in2, 0b10100000), 0b10100000), 0b10101010);
   ret.im.m128 = _mm_blend_epi16(_mm_shufflelo_epi16(_mm_shufflehi_epi16(in1, 0b11110101), 0b11110101),in2, 0b10101010);
+#else /* LV_HAVE_SSE*/
+#ifdef HAVE_NEON
+  ret.m128 = vld2q_s16((int16_t*)(ptr));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -1032,6 +1251,11 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
 #ifdef LV_HAVE_SSE
   ret.re.m128 = _mm_load_si128((__m128i*)(re));
   ret.im.m128 = _mm_load_si128((__m128i*)(im));
+#else /* LV_HAVE_SSE*/
+#ifdef HAVE_NEON
+  ret.m128.val[0] = vld1q_s16((int16_t*)(re));
+  ret.m128.val[1] = vld1q_s16((int16_t*)(im));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
   return ret;
@@ -1046,6 +1270,11 @@ static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
 #ifdef LV_HAVE_SSE
   ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
   ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
+#else /* LV_HAVE_SSE*/
+#ifdef HAVE_NEON
+  ret.m128.val[0] = vld1q_s16((int16_t*)(re));
+  ret.m128.val[1] = vld1q_s16((int16_t*)(im));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
   return ret;
@@ -1063,6 +1292,10 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
   __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
   _mm_store_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
   _mm_store_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst2q_f32((float*)(ptr) ,simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 }
@@ -1079,6 +1312,10 @@ static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
   __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
   _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
   _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#else /*HAVE_NEON*/
+#ifdef HAVE_NEON
+  vst2q_f32((float*)(ptr) ,simdreg);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 }
@@ -1091,6 +1328,11 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si
 #ifdef LV_HAVE_SSE
   _mm_store_si128((__m128i *) re, simdreg.re.m128);
   _mm_store_si128((__m128i *) im, simdreg.im.m128);
+#else
+#ifdef HAVE_NEON
+  vst1q_f32((int16_t *) re, simdreg.m128.val[0]);
+  vst1q_f32((int16_t *) im, simdreg.m128.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 }
@@ -1103,10 +1345,17 @@ static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t s
 #ifdef LV_HAVE_SSE
   _mm_storeu_si128((__m128i *) re, simdreg.re.m128);
   _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
+#else
+#ifdef HAVE_NEON
+  vst1q_f32((int16_t *) re, simdreg.m128.val[0]);
+  vst1q_f32((int16_t *) im, simdreg.m128.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 }
 
+
+//TODO
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -1134,11 +1383,16 @@ static inline simd_c16_t srslte_simd_c16_add (simd_c16_t a, simd_c16_t b) {
 #ifdef LV_HAVE_SSE
   ret.re.m128 = _mm_add_epi16(a.re.m128, b.re.m128);
   ret.im.m128 = _mm_add_epi16(a.im.m128, b.im.m128);
+#else
+#ifdef HAVE_NEON
+  ret.m128.val[0] = vaddq_s32(a.m127.val[0],a.m127.val[0]);
+  ret.m128.val[1] = vaddq_s32(a.m127.val[1],a.m127.val[1]);
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
   return ret;
 }
-
+vdupq_n_s16
 static inline simd_c16_t srslte_simd_c16_zero (void) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -1148,7 +1402,12 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
 #ifdef LV_HAVE_SSE
   ret.re.m128 = _mm_setzero_si128();
   ret.im.m128 = _mm_setzero_si128();
-#endif /* LV_HAVE_SSE */
+#else
+#ifdef HAVE_NEON
+  ret.m128.val[0] = vdupq_n_s16(0);
+  ret.m128.val[1] = vdupq_n_s16(0);
+#endif /* HAVE_NEON    */
+#endif /* LV_HAVE_SSE  */
 #endif /* LV_HAVE_AVX2 */
   return ret;
 }
@@ -1182,6 +1441,12 @@ static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
   __m128i ai = _mm_cvttps_epi32(a);
   __m128i bi = _mm_cvttps_epi32(b);
   return _mm_packs_epi32(ai, bi);
+ #else
+#ifdef HAVE_NEON
+  int32x4_t ai = vcvtq_s32_f32(a);
+  int32x4_t bi = vcvtq_s32_f32(b);
+  return (simd_s_t)vcombine_s16(vqmovn_s32(ai), vqmovn_s32(bi));
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */

From f4b9e7311a1daa4b306c9e8bebd3c956db3e6cad Mon Sep 17 00:00:00 2001
From: yagoda <tallonj@tcd.ie>
Date: Tue, 17 Oct 2017 15:51:27 +0000
Subject: [PATCH 32/55] adding neon support to new vector structure

---
 lib/include/srslte/phy/utils/simd.h | 123 +++++++++++++++++++---------
 lib/src/phy/utils/vector_simd.c     |  37 +++++++--
 2 files changed, 116 insertions(+), 44 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 6e4185788..e7820c307 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -33,6 +33,11 @@
 #endif
 #include <immintrin.h>
 #endif /* LV_HAVE_SSE */
+#include <stdio.h>
+
+#ifdef HAVE_NEON
+#include <arm_neon.h>
+#endif
 
 /*
  * SSE Macros
@@ -140,6 +145,7 @@
 #define SRSLTE_SIMD_S_SIZE    0
 #define SRSLTE_SIMD_C16_SIZE  0
 
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -159,7 +165,7 @@ typedef __m256 simd_f_t;
 typedef __m128 simd_f_t;
 #else /* HAVE_NEON */
 #ifdef HAVE_NEON
-typedef float32x4 simd_f_t;
+typedef float32x4_t simd_f_t;
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -188,7 +194,7 @@ static inline simd_f_t srslte_simd_f_loadu(float *ptr) {
 #ifdef LV_HAVE_AVX512
   return _mm512_loadu_ps(ptr);
 #else /* LV_HAVE_AVX512 */
-  #ifdef LV_HAVE_AVX2
+ #ifdef LV_HAVE_AVX2
   return _mm256_loadu_ps(ptr);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
@@ -202,7 +208,7 @@ static inline simd_f_t srslte_simd_f_loadu(float *ptr) {
 #endif /* LV_HAVE_AVX512 */
 }
 
-static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {vst1q_f32
+static inline void srslte_simd_f_store(float *ptr, simd_f_t simdreg) {
 #ifdef LV_HAVE_AVX512
   _mm512_store_ps(ptr, simdreg);
 #else /* LV_HAVE_AVX512 */
@@ -281,11 +287,11 @@ static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
 #ifdef LV_HAVE_AVX2
   return _mm256_rcp_ps(a);
 #else /* LV_HAVE_AVX2 */
-  #ifdef LV_HAVE_SSE
+#ifdef LV_HAVE_SSE
   return _mm_rcp_ps(a);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  return vrecpeq_f32(a);
+  return vmulq_f32(vrecpeq_f32(a), vrecpsq_f32(vrecpeq_f32(a), a));
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -302,6 +308,22 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
   return _mm_addsub_ps(a, b);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
+ float* a_ptr = &a;
+ float* b_ptr = &b;
+ simd_f_t ret;
+ float* c_ptr = &ret;
+ for(int i = 0; i<4;i++){
+   if(i%2==0){
+     c_ptr[i] = a_ptr[i] - b_ptr[i];
+   }else{
+     c_ptr[i] = a_ptr[i] + b_ptr[i];
+   }
+ }
+ 
+ return ret;
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -373,6 +395,10 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) {
 #else /* LV_HAVE_AVX2 */
   #ifdef LV_HAVE_SSE
   return _mm_shuffle_ps(a, a, 0b10110001);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON
+  return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(a)));
+#endif /* HAVE_NEON */  
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -421,7 +447,9 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
   return _mm_sqrt_ps(a);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  return vrecpeq_f32(vrsqrteq_f32(a));
+  float32x4_t sqrt_reciprocal = vrsqrteq_f32(a);
+  sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a,sqrt_reciprocal), sqrt_reciprocal),sqrt_reciprocal);
+  return vmulq_f32(a,sqrt_reciprocal);
 #endif /* HAVE_NEON */  
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -530,8 +558,8 @@ static inline simd_cf_t srslte_simd_cf_load(float *re, float *im) {
   ret.im = _mm_load_ps(im);
 #else /*HAVE_NEON*/
 #ifdef HAVE_NEON
-  ret.val[0] = vld1q_f32(ptr);
-  ret.val[1] = vld1q_f32(ptr);
+  ret.val[0] = vld1q_f32(re);
+  ret.val[1] = vld1q_f32(im);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -554,8 +582,8 @@ static inline simd_cf_t srslte_simd_cf_loadu(float *re, float *im) {
   ret.im = _mm_loadu_ps(im);
 #else /*HAVE_NEON*/
 #ifdef HAVE_NEON
-  ret.val[0] = vld1q_f32(ptr);
-  ret.val[1] = vld1q_f32(ptr);
+  ret.val[0] = vld1q_f32(re);
+  ret.val[1] = vld1q_f32(im);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -684,8 +712,8 @@ static inline simd_cf_t srslte_simd_cf_set1 (cf_t x) {
   ret.im = _mm_set1_ps(__imag__ x);
 #else /*HAVE_NEON*/
 #ifdef HAVE_NEON
-  re.val[0] = vdupq_n_f32(__real__ x);
-  im.val[1] = vdupq_n_f32(__imag__ x);
+  ret.val[0] = vdupq_n_f32(__real__ x);
+  ret.val[1] = vdupq_n_f32(__imag__ x);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -772,8 +800,8 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
   ret.im = _mm_add_ps(a.im, b.im);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  ret.val[0] = vaddq_f32(a.val[0],a.val[0]);
-  ret.val[1] = vaddq_f32(a.val[1],a.val[1]);
+  ret.val[0] = vaddq_f32(a.val[0],b.val[0]);
+  ret.val[1] = vaddq_f32(a.val[1],b.val[1]);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -839,8 +867,8 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
   simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]);
   simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]);
   simd_f_t mod2 = vaddq_f32(a2re, a2im);
-  simd_f_t rcp = vrecpeq_f32(mod2);
-  simd_f_t neg_a_im = vnegq_f32(vdupq_n_f32(-0.0f), a.val[1]);
+  simd_f_t rcp = vmulq_f32(vrecpeq_f32(mod2), vrecpsq_f32(vrecpeq_f32(mod2), mod2));
+  simd_f_t neg_a_im = vnegq_f32(a.val[1]);
   ret.val[0] = vmulq_f32(a.val[0], rcp);
   ret.val[1] = vmulq_f32(neg_a_im, rcp);
 #endif /* HAVE_NEON */
@@ -868,6 +896,7 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
   ret.val[0] = vdupq_n_f32(0);
   ret.val[1] = vdupq_n_f32(0);
 #endif /* HAVE_NEON */
+#endif /* HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
   return ret;
@@ -889,9 +918,9 @@ typedef __m256 simd_sel_t;
 typedef __m128i simd_i_t;
 typedef __m128 simd_sel_t;
 #else /* LV_HAVE_AVX2 */
-#ifdef LV_HAVE_SSE
+#ifdef HAVE_NEON
 typedef int32x4_t simd_i_t;
-typedef __m128 simd_sel_t;
+typedef int32x4_t simd_sel_t;
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -908,7 +937,7 @@ static inline simd_i_t srslte_simd_i_load(int *x) {
   return _mm_load_si128((__m128i*)x);
 #else
   #ifdef HAVE_NEON
-  return vld1_s32((int32x4_t*)x);
+  return vld1q_s32((int*)x);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -924,6 +953,10 @@ static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
 #else
 #ifdef LV_HAVE_SSE
   _mm_store_si128((__m128i*)x, reg);
+#else
+#ifdef HAVE_NEON
+  vst1q_s32((int*)x, reg);
+#endif /*HAVE_NEON*/
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -963,7 +996,7 @@ static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
-}vcgtq_f32
+}
 
 static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
@@ -992,6 +1025,25 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s
 #else
   #ifdef LV_HAVE_SSE
   return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector);
+#else /* LV_HAVE_SSE */
+#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
+  
+ int* a_ptr = &a;
+ int* b_ptr = &b;
+ simd_i_t ret;
+ int* sel = &selector;
+ 
+ int* c_ptr = &ret;
+ for(int i = 0;i<4;i++)
+ { 
+   if(sel[i] == -1){
+     c_ptr[i] = b_ptr[i];
+   }else{
+     c_ptr[i] = a_ptr[i];
+   }
+ }
+  return ret;
+#endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
@@ -1030,7 +1082,7 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
   return _mm_load_si128((__m128i*) ptr);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  return vld1q_s16((int16x8_t*) ptr);
+  return vld1q_s16(ptr);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1048,7 +1100,7 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
   return _mm_loadu_si128((__m128i*) ptr)
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  return vld1q_s16((int16x8_t*) ptr);
+  return vld1q_s16(ptr);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1066,7 +1118,7 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) {
   _mm_store_si128((__m128i*) ptr, simdreg);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  vst1q_s16((int16x8_t*) ptr, simdreg);
+  vst1q_s16( ptr, simdreg);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1084,13 +1136,12 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) {
   _mm_storeu_si128((__m128i*) ptr, simdreg);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
-  vst1q_s16((int16x8_t*) ptr, simdreg);
+  vst1q_s16(ptr, simdreg);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
 }
-vdupq_n_s16
 static inline simd_s_t srslte_simd_s_zero(void) {
 #ifdef LV_HAVE_AVX512
   return _mm512_setzero_si512();
@@ -1107,7 +1158,7 @@ static inline simd_s_t srslte_simd_s_zero(void) {
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 #endif /* LV_HAVE_AVX512 */
-}vmulq_s16
+}
 
 static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) {
 #ifdef LV_HAVE_AVX512
@@ -1294,7 +1345,7 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
   _mm_store_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
 #else /*HAVE_NEON*/
 #ifdef HAVE_NEON
-  vst2q_f32((float*)(ptr) ,simdreg);
+  vst2q_s16((int16_t*)(ptr) ,simdreg.m128);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1314,7 +1365,7 @@ static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
   _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
 #else /*HAVE_NEON*/
 #ifdef HAVE_NEON
-  vst2q_f32((float*)(ptr) ,simdreg);
+  vst2q_s16((int16_t*)(ptr) ,simdreg.m128);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1330,8 +1381,8 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t si
   _mm_store_si128((__m128i *) im, simdreg.im.m128);
 #else
 #ifdef HAVE_NEON
-  vst1q_f32((int16_t *) re, simdreg.m128.val[0]);
-  vst1q_f32((int16_t *) im, simdreg.m128.val[1]);
+  vst1q_s16((int16_t *) re, simdreg.m128.val[0]);
+  vst1q_s16((int16_t *) im, simdreg.m128.val[1]);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
@@ -1347,15 +1398,13 @@ static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t s
   _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
 #else
 #ifdef HAVE_NEON
-  vst1q_f32((int16_t *) re, simdreg.m128.val[0]);
-  vst1q_f32((int16_t *) im, simdreg.m128.val[1]);
+  vst1q_s16((int16_t *) re, simdreg.m128.val[0]);
+  vst1q_s16((int16_t *) im, simdreg.m128.val[1]);
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
 }
 
-
-//TODO
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -1385,14 +1434,14 @@ static inline simd_c16_t srslte_simd_c16_add (simd_c16_t a, simd_c16_t b) {
   ret.im.m128 = _mm_add_epi16(a.im.m128, b.im.m128);
 #else
 #ifdef HAVE_NEON
-  ret.m128.val[0] = vaddq_s32(a.m127.val[0],a.m127.val[0]);
-  ret.m128.val[1] = vaddq_s32(a.m127.val[1],a.m127.val[1]);
+  ret.m128.val[0] = vaddq_s16(a.m128.val[0],b.m128.val[0]); /* first lane set: a + b (previously added a to itself) */
+  ret.m128.val[1] = vaddq_s16(a.m128.val[1],b.m128.val[1]); /* second lane set: a + b */
 #endif /* HAVE_NEON */
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX2 */
   return ret;
 }
-vdupq_n_s16
+
 static inline simd_c16_t srslte_simd_c16_zero (void) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 0294bd1af..ab281a653 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -751,10 +751,37 @@ void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
   }
 }
 
+/* Multiplies the complex vector x by the complex scalar h and stores the result in z,
+ * advancing four samples per iteration. Only the HAVE_NEON build performs any work.
+ * Returns the number of samples written to z; the caller must process the remaining tail. */
+int  srslte_vec_sc_prod_ccc_simd2(cf_t *x, cf_t h, cf_t *z, int len)
+{
+  int i = 0;
+#ifdef HAVE_NEON
+  /* Signed count clamped at zero: a negative len must not wrap into a huge unsigned trip count */
+  const int loops = (len > 0) ? len / 4 : 0;
+  simd_cf_t h_vec;
+  h_vec.val[0] = srslte_simd_f_set1(__real__ h);
+  h_vec.val[1] = srslte_simd_f_set1(__imag__ h);
+  for (; i < loops; i++) {
+    simd_cf_t in = srslte_simd_cfi_load(&x[i*4]);
+    simd_cf_t temp = srslte_simd_cf_prod(in, h_vec);
+    srslte_simd_cfi_store(&z[i*4], temp);
+  }
+  i = loops * 4; /* convert the iteration count into a sample count */
+#endif /* HAVE_NEON */
+  return i; /* stays 0 when built without NEON, since nothing was processed */
+}
+
 void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
   int i = 0;
 
 #if SRSLTE_SIMD_F_SIZE
+  
+
+#ifdef HAVE_NEON
+  i = srslte_vec_sc_prod_ccc_simd2(x, h, z, len);
+#else
   const simd_f_t hre = srslte_simd_f_set1(__real__ h);
   const simd_f_t him = srslte_simd_f_set1(__imag__ h);
 
@@ -766,8 +793,8 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
       simd_f_t sw = srslte_simd_f_swap(temp);
       simd_f_t m2 = srslte_simd_f_mul(him, sw);
       simd_f_t r = srslte_simd_f_addsub(m1, m2);
-
       srslte_simd_f_store((float *) &z[i], r);
+
     }
   } else {
     for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
@@ -782,10 +809,11 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
     }
   }
 #endif
-
+#endif
   for (; i < len; i++) {
     z[i] = x[i] * h;
   }
+  
 }
 
 void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len) {
@@ -831,7 +859,6 @@ void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len) {
 
       simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
       z1 = srslte_simd_f_sqrt(z1);
-
       srslte_simd_f_store(&z[i], z1);
     }
   } else {
@@ -966,9 +993,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) {
   if (SRSLTE_IS_ALIGNED(x)) {
     for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
       simd_f_t a = srslte_simd_f_load(&x[i]);
-
       simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
-
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
       simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
@@ -976,9 +1001,7 @@ uint32_t srslte_vec_max_fi_simd(float *x, int len) {
   } else {
     for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
       simd_f_t a = srslte_simd_f_loadu(&x[i]);
-
       simd_sel_t res = srslte_simd_f_max(a, simd_max_values);
-
       simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
       simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
       simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);

From 0504e7a51b42288cd9e7f9b113b7d8bc4cc2f30d Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Wed, 18 Oct 2017 12:49:43 +0200
Subject: [PATCH 33/55] Fixed test for abs value. Solved compilation Neon
 warnings and SSE errors

---
 lib/include/srslte/phy/utils/simd.h  | 16 +++++++--------
 lib/src/phy/utils/test/vector_test.c | 30 ++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index e7820c307..0c378591e 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -310,10 +310,10 @@ static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
   return _mm_addsub_ps(a, b);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
- float* a_ptr = &a;
- float* b_ptr = &b;
+ float* a_ptr = (float*) &a;
+ float* b_ptr = (float*) &b;
  simd_f_t ret;
- float* c_ptr = &ret;
+ float* c_ptr = (float*) &ret;
  for(int i = 0; i<4;i++){
    if(i%2==0){
      c_ptr[i] = a_ptr[i] - b_ptr[i];
@@ -1028,12 +1028,12 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
   
- int* a_ptr = &a;
- int* b_ptr = &b;
+ int* a_ptr = (int*) &a;
+ int* b_ptr = (int*) &b;
  simd_i_t ret;
- int* sel = &selector;
+ int* sel = (int*) &selector;
  
- int* c_ptr = &ret;
+ int* c_ptr = (int*) &ret;
  for(int i = 0;i<4;i++)
  { 
    if(sel[i] == -1){
@@ -1097,7 +1097,7 @@ static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
   return _mm256_loadu_si256((__m256i*) ptr);
 #else /* LV_HAVE_AVX2 */
 #ifdef LV_HAVE_SSE
-  return _mm_loadu_si128((__m128i*) ptr)
+  return _mm_loadu_si128((__m128i*) ptr);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON
   return vld1q_s16(ptr);
diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c
index 8d5b9f2d6..4ebed9862 100644
--- a/lib/src/phy/utils/test/vector_test.c
+++ b/lib/src/phy/utils/test/vector_test.c
@@ -63,12 +63,12 @@ bool verbose = false;
     strncpy(func_name, #X, 32);\
     CODE;\
     passed = (mse < MAX_MSE);\
-    printf("%32s (%5d) ... %7.1f MSamp/s ... %3s Passed\n", func_name, block_size, \
-    (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not");\
+    printf("%32s (%5d) ... %7.1f MSamp/s ... %3s Passed (%.6f)\n", func_name, block_size, \
+    (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not", mse);\
     return passed;\
 }
 
-#define MALLOC(TYPE, NAME) TYPE *NAME = malloc(sizeof(TYPE)*block_size)
+#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size)
 
 
 static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
@@ -507,7 +507,7 @@ TEST(srslte_vec_abs_cf,
 
   for (int i = 0; i < block_size; i++) {
     gold = sqrtf(crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i]));
-    mse += cabsf(gold - z[i]);
+    mse += cabsf(gold - z[i])/block_size;
   }
 
   free(x);
@@ -771,12 +771,27 @@ int main(int argc, char **argv) {
     size_count++;
   }
 
+  char fname[68];
+  FILE *f = NULL;
+  FILE *p = popen("(date +%g%m%d && hostname) | tr '\\r\\n' '__'", "r");
+  if (p) {
+    /* Results file is named "<date>_<hostname>.tsv"; no file is opened if the command printed nothing */
+    if (fgets(fname, 64, p) && fname[0] != '\0') {
+      strncpy(fname + strnlen(fname, 64) - 1, ".tsv", 5); /* replace the trailing '_' and copy the NUL too */
+      f = fopen(fname, "w");
+      if (f) printf("Saving benchmark results in '%s'\n", fname);
+    }
+    pclose(p); /* kept inside the NULL check: pclose must only receive a stream returned by popen */
+  }
   printf("\n");
   printf("%32s |", "Subroutine/MSps");
+  if (f) fprintf(f, "Subroutine/MSps Vs Vector size\t");
   for (int i = 0; i < size_count; i++) {
     printf(" %7d", sizes[i]);
+    if (f) fprintf(f, "%d\t", sizes[i]);
   }
   printf("  |\n");
+  if (f) fprintf(f, "\n");
 
   for (int j = 0; j < 32; j++) {
     printf("-");
@@ -789,12 +804,19 @@ int main(int argc, char **argv) {
 
   for (int i = 0; i < func_count; i++) {
     printf("%32s | ", func_names[i]);
+    if (f) fprintf(f, "%s\t", func_names[i]);
+
     for (int j = 0; j < size_count; j++) {
       printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      if (f) fprintf(f, "%.1f\t", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+
       all_passed &= passed[i][j];
     }
     printf(" |\n");
+    if (f) fprintf(f, "\n");
   }
 
+  if (f) fclose(f);
+
   return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
 }

From 12877fe2960a64fb371f6bd8060bd94475d9abdf Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 18 Oct 2017 16:33:19 -0400
Subject: [PATCH 34/55] Missing ip_netmask configuration file

---
 srsue/src/main.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/srsue/src/main.cc b/srsue/src/main.cc
index 47d5f807e..3b0cad2d0 100644
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@@ -122,6 +122,10 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
 
 
     /* Expert section */
+    ("expert.ip_netmask",
+     bpo::value<string>(&args->expert.ip_netmask)->default_value("255.255.255.0"),
+     "Netmask of the tun_srsue device")
+
     ("expert.phy.worker_cpu_mask",
      bpo::value<int>(&args->expert.phy.worker_cpu_mask)->default_value(-1),
      "cpu bit mask (eg 255 = 1111 1111)")

From 739b8fc457f3588f0d9d580c43de978d31329390 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 18 Oct 2017 16:34:18 -0400
Subject: [PATCH 35/55] Restored TA commands

---
 srsue/src/phy/phy.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/srsue/src/phy/phy.cc b/srsue/src/phy/phy.cc
index df29726d5..9760ab374 100644
--- a/srsue/src/phy/phy.cc
+++ b/srsue/src/phy/phy.cc
@@ -200,8 +200,8 @@ void phy::set_timeadv_rar(uint32_t ta_cmd) {
 
 void phy::set_timeadv(uint32_t ta_cmd) {
   n_ta = srslte_N_ta_new(n_ta, ta_cmd);
-  //sf_recv.set_time_adv_sec(((float) n_ta)*SRSLTE_LTE_TS);  
-  Warning("Not supported: Set TA: ta_cmd: %d, n_ta: %d, ta_usec: %.1f\n", ta_cmd, n_ta, ((float) n_ta)*SRSLTE_LTE_TS*1e6);
+  sf_recv.set_time_adv_sec(((float) n_ta)*SRSLTE_LTE_TS);
+  //Warning("Not supported: Set TA: ta_cmd: %d, n_ta: %d, ta_usec: %.1f\n", ta_cmd, n_ta, ((float) n_ta)*SRSLTE_LTE_TS*1e6);
 }
 
 void phy::configure_prach_params()
@@ -308,7 +308,8 @@ void phy::reset()
   pdcch_dl_search_reset();
   for(uint32_t i=0;i<nof_workers;i++) {
     workers[i].reset();
-  }    
+  }
+  workers_common.reset_ul();
 }
 
 uint32_t phy::get_current_tti()

From 240eba2af56b439f4b4b67f432d182bf9c997f29 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 18 Oct 2017 16:49:36 -0400
Subject: [PATCH 36/55] Fix for async dl scheduling

---
 lib/include/srslte/common/common.h |  1 +
 srsue/hdr/mac/mac.h                |  2 +-
 srsue/src/phy/phch_worker.cc       | 11 +++++++----
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/lib/include/srslte/common/common.h b/lib/include/srslte/common/common.h
index 6372af73e..fbc703486 100644
--- a/lib/include/srslte/common/common.h
+++ b/lib/include/srslte/common/common.h
@@ -52,6 +52,7 @@
 #define TTIMOD_SZ       (((2*HARQ_DELAY_MS) < 10)?10:20)
 #define TTIMOD(tti)     (tti%TTIMOD_SZ)
 
+#define MOD_N_PROC      (2*HARQ_DELAY_MS-8)
 #define ASYNC_DL_SCHED  (HARQ_DELAY_MS <= 4)
 
 // Cat 3 UE - Max number of DL-SCH transport block bits received within a TTI
diff --git a/srsue/hdr/mac/mac.h b/srsue/hdr/mac/mac.h
index d19f668bf..87bb764e1 100644
--- a/srsue/hdr/mac/mac.h
+++ b/srsue/hdr/mac/mac.h
@@ -135,7 +135,7 @@ private:
   demux         demux_unit; 
   
   /* DL/UL HARQ */
-  dl_harq_entity<MAC_NOF_HARQ_PROC, mac_grant_t, tb_action_dl_t, srslte_phy_grant_t> dl_harq;
+  dl_harq_entity<MAC_NOF_HARQ_PROC+MOD_N_PROC, mac_grant_t, tb_action_dl_t, srslte_phy_grant_t> dl_harq;
   ul_harq_entity<MAC_NOF_HARQ_PROC, mac_grant_t, tb_action_ul_t, srslte_phy_grant_t> ul_harq;
   
   /* MAC Uplink-related Procedures */
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index f89696015..a32bb7537 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -65,8 +65,7 @@ phch_worker::phch_worker() : tr_exec(10240)
   cell_initiated  = false; 
   pregen_enabled  = false; 
   trace_enabled   = false;
-
-  reset();  
+  reset();
 }
 
 
@@ -97,7 +96,7 @@ void phch_worker::reset()
   bzero(&period_cqi, sizeof(srslte_cqi_periodic_cfg_t));
   I_sr = 0; 
   rnti_is_set     = false; 
-  rar_cqi_request = false; 
+  rar_cqi_request = false;
   cfi = 0;
 }
 
@@ -482,7 +481,11 @@ bool phch_worker::decode_pdcch_dl(srsue::mac_interface_phy::mac_grant_t* grant)
     /* Fill MAC grant structure */
     grant->ndi[0] = dci_unpacked.ndi;
     grant->ndi[1] = dci_unpacked.ndi_1;
-    grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
+    if (tti < MOD_N_PROC) {
+      grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:tti+(2*HARQ_DELAY_MS);
+    } else {
+      grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
+    }
     grant->n_bytes[0] = grant->phy_grant.dl.mcs[0].tbs / (uint32_t) 8;
     grant->n_bytes[1] = grant->phy_grant.dl.mcs[1].tbs / (uint32_t) 8;
     grant->tti = tti;

From 177f36fc8c1fa7c31667cd3bc8baa3d2c5548af9 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 18 Oct 2017 16:50:22 -0400
Subject: [PATCH 37/55] Improved cell reestablishment procedure timers (still
 not working the authentication)

---
 srsue/hdr/phy/phch_recv.h    |  4 ++--
 srsue/hdr/upper/rrc.h        |  5 ++++-
 srsue/src/mac/mac.cc         |  3 ++-
 srsue/src/phy/phch_common.cc |  1 +
 srsue/src/upper/rrc.cc       | 28 ++++++++++++++++------------
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/srsue/hdr/phy/phch_recv.h b/srsue/hdr/phy/phch_recv.h
index c51622e53..01a296094 100644
--- a/srsue/hdr/phy/phch_recv.h
+++ b/srsue/hdr/phy/phch_recv.h
@@ -158,7 +158,7 @@ private:
   uint32_t      current_earfcn;
 
   uint32_t      sync_sfn_cnt;
-  const static uint32_t SYNC_SFN_TIMEOUT = 200;
+  const static uint32_t SYNC_SFN_TIMEOUT = 1000;
   float ul_dl_factor;
   int cur_earfcn_index;
   bool cell_search_in_progress;
@@ -166,7 +166,7 @@ private:
   float    measure_rsrp;
   srslte_ue_dl_t ue_dl_measure;
 
-  const static int RSRP_MEASURE_NOF_FRAMES = 5;
+  const static int RSRP_MEASURE_NOF_FRAMES = 10;
 
   int    cell_sync_sfn();
   int    cell_meas_rsrp();
diff --git a/srsue/hdr/upper/rrc.h b/srsue/hdr/upper/rrc.h
index 3643f76c3..3e2fb70dd 100644
--- a/srsue/hdr/upper/rrc.h
+++ b/srsue/hdr/upper/rrc.h
@@ -98,6 +98,8 @@ private:
   uint8_t transaction_id;
   bool drb_up;
 
+  bool reestablishment_in_progress;
+
   // timeouts in ms
 
   uint32_t connecting_timeout;
@@ -244,7 +246,8 @@ private:
 
   // Helpers
   void          rrc_connection_release();
-  void          radio_link_failure(); 
+  void          con_restablish_cell_reselected();
+  void          radio_link_failure();
   static void*  start_sib_thread(void *rrc_);
   void          sib_search();
   void          apply_sib2_configs(LIBLTE_RRC_SYS_INFO_BLOCK_TYPE_2_STRUCT *sib2);
diff --git a/srsue/src/mac/mac.cc b/srsue/src/mac/mac.cc
index 1a2c909ae..f8d10bb34 100644
--- a/srsue/src/mac/mac.cc
+++ b/srsue/src/mac/mac.cc
@@ -117,7 +117,8 @@ void mac::reset()
 
   Info("Resetting MAC\n");
 
-  timers.stop_all();
+  timers.get(timer_alignment)->stop();
+  timers.get(contention_resolution_timer)->stop();
 
   ul_harq.reset_ndi();
 
diff --git a/srsue/src/phy/phch_common.cc b/srsue/src/phy/phch_common.cc
index 7d2948c1a..d956ddd15 100644
--- a/srsue/src/phy/phch_common.cc
+++ b/srsue/src/phy/phch_common.cc
@@ -336,6 +336,7 @@ void phch_common::reset_ul()
     pthread_mutex_trylock(&tx_mutex[i]);
     pthread_mutex_unlock(&tx_mutex[i]);
   }
+  radio_h->tx_end();
 }
 
 }
diff --git a/srsue/src/upper/rrc.cc b/srsue/src/upper/rrc.cc
index 94c2e449a..4ed20deeb 100644
--- a/srsue/src/upper/rrc.cc
+++ b/srsue/src/upper/rrc.cc
@@ -35,8 +35,6 @@
 #include "srslte/common/security.h"
 #include "srslte/common/bcd_helpers.h"
 
-#define TIMEOUT_RESYNC_REESTABLISH 100
-
 using namespace srslte;
 
 namespace srsue {
@@ -92,6 +90,8 @@ void rrc::init(phy_interface_rrc *phy_,
 
   pthread_mutex_init(&mutex, NULL);
 
+  reestablishment_in_progress = false;
+
   ue_category = SRSLTE_UE_CATEGORY;
   t301 = mac_timers->timer_get_unique_id();
   t310 = mac_timers->timer_get_unique_id();
@@ -207,7 +207,11 @@ void rrc::run_thread() {
         break;
       case RRC_STATE_CELL_SELECTED:
         rrc_log->info("RRC Cell Selected: Sending connection request...\n");
-        send_con_request();
+        if (reestablishment_in_progress) {
+          con_restablish_cell_reselected();
+        } else {
+          send_con_request();
+        }
         state = RRC_STATE_CONNECTING;
         connecting_timeout = 0;
         break;
@@ -226,6 +230,7 @@ void rrc::run_thread() {
         usleep(60000);
         rrc_log->info("Leaving RRC_CONNECTED state\n");
         drb_up = false;
+        reestablishment_in_progress = false;
         pdcp->reset();
         rlc->reset();
         phy->reset();
@@ -663,6 +668,8 @@ void rrc::send_con_restablish_request() {
   ul_ccch_msg.msg.rrc_con_reest_req.cause = LIBLTE_RRC_CON_REEST_REQ_CAUSE_OTHER_FAILURE;
   liblte_rrc_pack_ul_ccch_msg(&ul_ccch_msg, (LIBLTE_BIT_MSG_STRUCT *) &bit_buf);
 
+  reestablishment_in_progress = true;
+
   rrc_log->info("Initiating RRC Connection Reestablishment Procedure\n");
   rrc_log->console("RRC Connection Reestablishment\n");
   mac_timers->timer_get(t310)->stop();
@@ -673,19 +680,16 @@ void rrc::send_con_restablish_request() {
   set_phy_default();
   mac->reset();
   set_mac_default();
+}
 
-  // FIXME: Cell selection should be different??
-
-  // Wait for cell re-synchronization
-  uint32_t timeout_cnt = 0;
-  while (!phy->sync_status() && timeout_cnt < TIMEOUT_RESYNC_REESTABLISH) {
-    usleep(10000);
-    timeout_cnt++;
-  }
+// Actions following cell reselection 5.3.7.3
+void rrc::con_restablish_cell_reselected()
+{
+  reestablishment_in_progress = false;
+  rrc_log->info("Cell Selection finished. Initiating transmission of RRC Connection Reestablishment Request\n");
   mac_timers->timer_get(t301)->reset();
   mac_timers->timer_get(t301)->run();
   mac_timers->timer_get(t311)->stop();
-  rrc_log->info("Cell Selection finished. Initiating transmission of RRC Connection Reestablishment Request\n");
 
   // Byte align and pack the message bits for PDCP
   if ((bit_buf.N_bits % 8) != 0) {

From b8d5b5b6a92807ebee34ebb698c2f190a79b3efb Mon Sep 17 00:00:00 2001
From: Paul Sutton <suttonpd@gmail.com>
Date: Wed, 18 Oct 2017 21:54:52 +0100
Subject: [PATCH 38/55] count_dl fix

---
 srsue/src/upper/nas.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/srsue/src/upper/nas.cc b/srsue/src/upper/nas.cc
index f0fd8cf54..110c8c78e 100644
--- a/srsue/src/upper/nas.cc
+++ b/srsue/src/upper/nas.cc
@@ -281,7 +281,6 @@ void nas::parse_attach_accept(uint32_t lcid, byte_buffer_t *pdu) {
   LIBLTE_MME_ACTIVATE_DEFAULT_EPS_BEARER_CONTEXT_ACCEPT_MSG_STRUCT act_def_eps_bearer_context_accept;
 
   nas_log->info("Received Attach Accept\n");
-  count_dl++;
 
   liblte_mme_unpack_attach_accept_msg((LIBLTE_BYTE_MSG_STRUCT *) pdu, &attach_accept);
 
@@ -359,6 +358,8 @@ void nas::parse_attach_accept(uint32_t lcid, byte_buffer_t *pdu) {
     state = EMM_STATE_REGISTERED;
     current_plmn = selecting_plmn;
 
+    count_dl++;
+
     // Send EPS bearer context accept and attach complete
     count_ul++;
     act_def_eps_bearer_context_accept.eps_bearer_id = eps_bearer_id;
@@ -437,6 +438,9 @@ void nas::parse_authentication_request(uint32_t lcid, byte_buffer_t *pdu) {
     nas_log->console("Warning: Network authentication failure\n");
     pool->deallocate(pdu);
   }
+
+  // Reset DL counter (as per 24.301 5.4.3.2)
+  count_dl = 0;
 }
 
 void nas::parse_authentication_reject(uint32_t lcid, byte_buffer_t *pdu) {
@@ -539,6 +543,8 @@ void nas::parse_security_mode_command(uint32_t lcid, byte_buffer_t *pdu) {
     }
   }
 
+  count_dl++;
+
   if (!success) {
     // Reuse pdu for response
     pdu->reset();

From f1ec1b2f3c07bdf3bb0c09bb86e7996204742637 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Wed, 18 Oct 2017 19:33:43 -0400
Subject: [PATCH 39/55] disable attempt to resync after N310 (too soon if N310
 is low)

---
 srsue/src/upper/rrc.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/srsue/src/upper/rrc.cc b/srsue/src/upper/rrc.cc
index 4ed20deeb..10373dcf2 100644
--- a/srsue/src/upper/rrc.cc
+++ b/srsue/src/upper/rrc.cc
@@ -489,12 +489,12 @@ void rrc::earfcn_end() {
 
 // Detection of physical layer problems (5.3.11.1)
 void rrc::out_of_sync() {
-    current_cell->in_sync = false;
+  current_cell->in_sync = false;
   if (!mac_timers->timer_get(t311)->is_running() && !mac_timers->timer_get(t310)->is_running()) {
     n310_cnt++;
     if (n310_cnt == N310) {
       // attempt resync
-      phy->sync_reset();
+      //phy->sync_reset();
 
       mac_timers->timer_get(t310)->reset();
       mac_timers->timer_get(t310)->run();

From 3292f9c2694ce5b5a75501427b5a2fd09fc38189 Mon Sep 17 00:00:00 2001
From: yagoda <tallonj@tcd.ie>
Date: Thu, 19 Oct 2017 16:38:58 +0000
Subject: [PATCH 40/55] simd.h tidy up & small fix for eMBMS

---
 lib/include/srslte/phy/utils/simd.h | 2 --
 lib/src/phy/ue/ue_dl.c              | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h
index 0c378591e..09e9cff8e 100644
--- a/lib/include/srslte/phy/utils/simd.h
+++ b/lib/include/srslte/phy/utils/simd.h
@@ -1027,12 +1027,10 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s
   return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector);
 #else /* LV_HAVE_SSE */
 #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON
-  
  int* a_ptr = (int*) &a;
  int* b_ptr = (int*) &b;
  simd_i_t ret;
  int* sel = (int*) &selector;
- 
  int* c_ptr = (int*) &ret;
  for(int i = 0;i<4;i++)
  { 
diff --git a/lib/src/phy/ue/ue_dl.c b/lib/src/phy/ue/ue_dl.c
index c4e2d3f6c..c11a29b8b 100644
--- a/lib/src/phy/ue/ue_dl.c
+++ b/lib/src/phy/ue/ue_dl.c
@@ -603,7 +603,8 @@ int srslte_ue_dl_decode_mbsfn(srslte_ue_dl_t * q,
   grant.sf_type = SRSLTE_SF_MBSFN;
   grant.nof_tb = 1;
   grant.mcs[0].idx = 2;
- 
+  grant.tb_en[0] = true;
+  grant.tb_en[1] = false;
   grant.nof_prb = q->pmch.cell.nof_prb;
   srslte_dl_fill_ra_mcs(&grant.mcs[0], grant.nof_prb);
   srslte_softbuffer_rx_reset_tbs(q->softbuffers[0], (uint32_t) grant.mcs[0].tbs);

From 494802ba9b0299a5ddb02000240ea47236836ece Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 14:33:05 -0400
Subject: [PATCH 41/55] Fixed bug in RLC UM when TX large number of segments in
 a PDU

---
 lib/src/upper/rlc_um.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/src/upper/rlc_um.cc b/lib/src/upper/rlc_um.cc
index b2697178c..045fb0b7e 100644
--- a/lib/src/upper/rlc_um.cc
+++ b/lib/src/upper/rlc_um.cc
@@ -309,7 +309,7 @@ int  rlc_um::build_data_pdu(uint8_t *payload, uint32_t nof_bytes)
   }
 
   // Pull SDUs from queue
-  while(pdu_space > head_len && tx_sdu_queue.size() > 0)
+  while(pdu_space > head_len + 1 && tx_sdu_queue.size() > 0)
   {
     log->debug("pdu_space=%d, head_len=%d\n", pdu_space, head_len);
     if(last_li > 0)

From 399f1cdbd1352aad215f2baaf8386f335334e70b Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 14:47:14 -0400
Subject: [PATCH 42/55] Force retx for mcs>=29 and new tb if rv=0 && mcs<29

---
 lib/include/srslte/common/common.h | 1 -
 srsue/hdr/mac/dl_harq.h            | 9 +++++----
 srsue/hdr/mac/mac.h                | 2 +-
 srsue/src/phy/phch_worker.cc       | 6 +-----
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/lib/include/srslte/common/common.h b/lib/include/srslte/common/common.h
index fbc703486..6372af73e 100644
--- a/lib/include/srslte/common/common.h
+++ b/lib/include/srslte/common/common.h
@@ -52,7 +52,6 @@
 #define TTIMOD_SZ       (((2*HARQ_DELAY_MS) < 10)?10:20)
 #define TTIMOD(tti)     (tti%TTIMOD_SZ)
 
-#define MOD_N_PROC      (2*HARQ_DELAY_MS-8)
 #define ASYNC_DL_SCHED  (HARQ_DELAY_MS <= 4)
 
 // Cat 3 UE - Max number of DL-SCH transport block bits received within a TTI
diff --git a/srsue/hdr/mac/dl_harq.h b/srsue/hdr/mac/dl_harq.h
index 278dbc6dc..9abcf0922 100644
--- a/srsue/hdr/mac/dl_harq.h
+++ b/srsue/hdr/mac/dl_harq.h
@@ -259,7 +259,7 @@ private:
         memcpy(&cur_grant, &grant, sizeof(Tgrant));
 
         // If data has not yet been successfully decoded
-        if (!ack) {
+        if (!ack || (grant.rv[tid]==0 && grant.phy_grant.dl.mcs[tid].idx < 29)) {
 
           // Instruct the PHY To combine the received data and attempt to decode it
           if (pid == HARQ_BCCH_PID) {
@@ -347,9 +347,10 @@ private:
       // Determine if it's a new transmission 5.3.2.2
       bool calc_is_new_transmission(Tgrant grant) {
 
-        if ((grant.ndi[tid] != cur_grant.ndi[tid])       || // 1st condition (NDI has changed)
-            (pid == HARQ_BCCH_PID && grant.rv[tid] == 0) || // 2nd condition (Broadcast and 1st transmission)
-             is_first_tb)                                   // 3rd condition (first TB)
+        if (grant.phy_grant.dl.mcs[tid].idx < 28 &&          // mcs 29,30,31 always retx regardless of rest
+            ((grant.ndi[tid] != cur_grant.ndi[tid])       || // 1st condition (NDI has changed)
+             (pid == HARQ_BCCH_PID && grant.rv[tid] == 0) || // 2nd condition (Broadcast and 1st transmission)
+             is_first_tb))
         {
           is_first_tb         = false;
           is_new_transmission = true;
diff --git a/srsue/hdr/mac/mac.h b/srsue/hdr/mac/mac.h
index 87bb764e1..d19f668bf 100644
--- a/srsue/hdr/mac/mac.h
+++ b/srsue/hdr/mac/mac.h
@@ -135,7 +135,7 @@ private:
   demux         demux_unit; 
   
   /* DL/UL HARQ */
-  dl_harq_entity<MAC_NOF_HARQ_PROC+MOD_N_PROC, mac_grant_t, tb_action_dl_t, srslte_phy_grant_t> dl_harq;
+  dl_harq_entity<MAC_NOF_HARQ_PROC, mac_grant_t, tb_action_dl_t, srslte_phy_grant_t> dl_harq;
   ul_harq_entity<MAC_NOF_HARQ_PROC, mac_grant_t, tb_action_ul_t, srslte_phy_grant_t> ul_harq;
   
   /* MAC Uplink-related Procedures */
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index a32bb7537..1d86461cd 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -481,11 +481,7 @@ bool phch_worker::decode_pdcch_dl(srsue::mac_interface_phy::mac_grant_t* grant)
     /* Fill MAC grant structure */
     grant->ndi[0] = dci_unpacked.ndi;
     grant->ndi[1] = dci_unpacked.ndi_1;
-    if (tti < MOD_N_PROC) {
-      grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:tti+(2*HARQ_DELAY_MS);
-    } else {
-      grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
-    }
+    grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
     grant->n_bytes[0] = grant->phy_grant.dl.mcs[0].tbs / (uint32_t) 8;
     grant->n_bytes[1] = grant->phy_grant.dl.mcs[1].tbs / (uint32_t) 8;
     grant->tti = tti;

From 2f44e2bf3a4082211a4f57f3d99e778af103e8ec Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 16:10:27 -0400
Subject: [PATCH 43/55] Refactored cases mcs>29 for ul/dl

---
 lib/include/srslte/phy/common/phy_common.h |   3 +-
 lib/include/srslte/phy/phch/ra.h           |   3 +-
 lib/src/phy/modem/demod_hard.c             |   1 +
 lib/src/phy/modem/modem_table.c            |   1 +
 lib/src/phy/phch/dci.c                     |   4 +-
 lib/src/phy/phch/ra.c                      | 182 ++++++++-------------
 lib/src/phy/phch/test/pusch_test.c         |   2 +-
 srsenb/src/phy/phch_worker.cc              |   2 +-
 srsue/hdr/phy/phch_worker.h                |   7 +-
 srsue/src/phy/phch_worker.cc               |  47 ++++--
 10 files changed, 123 insertions(+), 129 deletions(-)

diff --git a/lib/include/srslte/phy/common/phy_common.h b/lib/include/srslte/phy/common/phy_common.h
index 148a12974..0b71ad82e 100644
--- a/lib/include/srslte/phy/common/phy_common.h
+++ b/lib/include/srslte/phy/common/phy_common.h
@@ -197,7 +197,8 @@ typedef enum SRSLTE_API {
   SRSLTE_MOD_BPSK = 0, 
   SRSLTE_MOD_QPSK, 
   SRSLTE_MOD_16QAM, 
-  SRSLTE_MOD_64QAM
+  SRSLTE_MOD_64QAM,
+  SRSLTE_MOD_LAST
 } srslte_mod_t;
 
 typedef struct SRSLTE_API {
diff --git a/lib/include/srslte/phy/phch/ra.h b/lib/include/srslte/phy/phch/ra.h
index 3039455ed..680d80dd6 100644
--- a/lib/include/srslte/phy/phch/ra.h
+++ b/lib/include/srslte/phy/phch/ra.h
@@ -226,8 +226,7 @@ SRSLTE_API uint32_t srslte_ra_dl_grant_nof_re(srslte_ra_dl_grant_t *grant,
 SRSLTE_API int srslte_ra_ul_dci_to_grant(srslte_ra_ul_dci_t *dci, 
                                          uint32_t nof_prb,
                                          uint32_t n_rb_ho, 
-                                         srslte_ra_ul_grant_t *grant, 
-                                         uint32_t harq_pid);
+                                         srslte_ra_ul_grant_t *grant);
 
 SRSLTE_API void srslte_ra_ul_grant_to_nbits(srslte_ra_ul_grant_t *grant, 
                                             srslte_cp_t cp,
diff --git a/lib/src/phy/modem/demod_hard.c b/lib/src/phy/modem/demod_hard.c
index 76f54236d..899559ecc 100644
--- a/lib/src/phy/modem/demod_hard.c
+++ b/lib/src/phy/modem/demod_hard.c
@@ -44,6 +44,7 @@ int srslte_demod_hard_demodulate(srslte_demod_hard_t* q, cf_t* symbols, uint8_t
 
   int nbits=-1;
   switch(q->mod) {
+  case SRSLTE_MOD_LAST:
   case SRSLTE_MOD_BPSK:
     hard_bpsk_demod(symbols,bits,nsymbols);
     nbits=nsymbols;
diff --git a/lib/src/phy/modem/modem_table.c b/lib/src/phy/modem/modem_table.c
index c19e52e77..3c4ad2417 100644
--- a/lib/src/phy/modem/modem_table.c
+++ b/lib/src/phy/modem/modem_table.c
@@ -82,6 +82,7 @@ int srslte_modem_table_set(srslte_modem_table_t* q, cf_t* table, uint32_t nsymbo
 int srslte_modem_table_lte(srslte_modem_table_t* q, srslte_mod_t modulation) {
   srslte_modem_table_init(q);
   switch(modulation) {
+  case SRSLTE_MOD_LAST:
   case SRSLTE_MOD_BPSK:
     q->nbits_x_symbol = 1;
     q->nsymbols = 2;
diff --git a/lib/src/phy/phch/dci.c b/lib/src/phy/phch/dci.c
index 2daa7e10d..471429145 100644
--- a/lib/src/phy/phch/dci.c
+++ b/lib/src/phy/phch/dci.c
@@ -111,7 +111,7 @@ int srslte_dci_rar_to_ul_grant(srslte_dci_rar_grant_t *rar, uint32_t nof_prb,
   srslte_ra_type2_from_riv(riv, &ul_dci->type2_alloc.L_crb, &ul_dci->type2_alloc.RB_start,
                            nof_prb, nof_prb);
   
-  if (srslte_ra_ul_dci_to_grant(ul_dci, nof_prb, n_rb_ho, grant, 0)) {
+  if (srslte_ra_ul_dci_to_grant(ul_dci, nof_prb, n_rb_ho, grant)) {
     return SRSLTE_ERROR;
   }
   
@@ -177,7 +177,7 @@ int srslte_dci_msg_to_ul_grant(srslte_dci_msg_t *msg, uint32_t nof_prb,
       return ret;
     } 
 
-    if (srslte_ra_ul_dci_to_grant(ul_dci, nof_prb, n_rb_ho, grant, harq_pid)) {
+    if (srslte_ra_ul_dci_to_grant(ul_dci, nof_prb, n_rb_ho, grant)) {
       return ret;
     }
     
diff --git a/lib/src/phy/phch/ra.c b/lib/src/phy/phch/ra.c
index be10c304c..8cae9fe65 100644
--- a/lib/src/phy/phch/ra.c
+++ b/lib/src/phy/phch/ra.c
@@ -185,108 +185,87 @@ int srslte_ra_ul_dci_to_grant_prb_allocation(srslte_ra_ul_dci_t *dci, srslte_ra_
   }
 }
 
-srslte_mod_t last_mod[8];
-uint32_t     last_ul_tbs_idx[8];
-uint32_t     last_dl_tbs[8];
-uint32_t     last_dl_tbs2[8];
-
-static int ul_dci_to_grant_mcs(srslte_ra_ul_dci_t *dci, srslte_ra_ul_grant_t *grant, uint32_t harq_pid) {  
-  int tbs = -1; 
+static void ul_dci_to_grant_mcs(srslte_ra_ul_dci_t *dci, srslte_ra_ul_grant_t *grant) {
   // 8.6.2 First paragraph
   if (dci->mcs_idx <= 28) {
     /* Table 8.6.1-1 on 36.213 */
     if (dci->mcs_idx < 11) {
       grant->mcs.mod = SRSLTE_MOD_QPSK;
-      tbs = srslte_ra_tbs_from_idx(dci->mcs_idx, grant->L_prb);      
-      last_ul_tbs_idx[harq_pid%8] = dci->mcs_idx;
+      grant->mcs.tbs = srslte_ra_tbs_from_idx(dci->mcs_idx, grant->L_prb);
     } else if (dci->mcs_idx < 21) {
       grant->mcs.mod = SRSLTE_MOD_16QAM;
-      tbs = srslte_ra_tbs_from_idx(dci->mcs_idx-1, grant->L_prb);
-      last_ul_tbs_idx[harq_pid%8] = dci->mcs_idx-1;
+      grant->mcs.tbs = srslte_ra_tbs_from_idx(dci->mcs_idx-1, grant->L_prb);
     } else if (dci->mcs_idx < 29) {
       grant->mcs.mod = SRSLTE_MOD_64QAM;
-      tbs = srslte_ra_tbs_from_idx(dci->mcs_idx-2, grant->L_prb);
-      last_ul_tbs_idx[harq_pid%8] = dci->mcs_idx-2;      
+      grant->mcs.tbs = srslte_ra_tbs_from_idx(dci->mcs_idx-2, grant->L_prb);
     } else {
       fprintf(stderr, "Invalid MCS index %d\n", dci->mcs_idx);
     }
-    last_mod[harq_pid%8] = grant->mcs.mod;
   } else if (dci->mcs_idx == 29 && dci->cqi_request && grant->L_prb <= 4) {
     // 8.6.1 and 8.6.2 36.213 second paragraph
     grant->mcs.mod = SRSLTE_MOD_QPSK;
-    tbs = srslte_ra_tbs_from_idx(last_ul_tbs_idx[harq_pid%8], grant->L_prb); 
-    dci->rv_idx = 1; 
+    grant->mcs.tbs = 0;
+    dci->rv_idx = 1;
   } else if (dci->mcs_idx >= 29) {
-    // Else use last TBS/Modulation and use mcs to obtain rv_idx 
-    tbs = srslte_ra_tbs_from_idx(last_ul_tbs_idx[harq_pid%8], grant->L_prb); 
-    grant->mcs.mod = last_mod[harq_pid%8]; 
+    // Else use last TBS/Modulation and use mcs to obtain rv_idx
+    grant->mcs.tbs = -1;
+    grant->mcs.mod = SRSLTE_MOD_LAST;
     dci->rv_idx = dci->mcs_idx - 28;
-    DEBUG("TTI=%d, harq_pid=%d, mcs_idx=%d, tbs=%d, mod=%d, rv=%d\n", 
-           harq_pid, harq_pid%8, dci->mcs_idx, tbs/8, grant->mcs.mod, dci->rv_idx);
-  }
-  if (tbs < 0) {
-    fprintf(stderr, "Error computing TBS\n");
-    return SRSLTE_ERROR; 
-  } else {
-    grant->mcs.tbs = (uint32_t) tbs; 
-    return SRSLTE_SUCCESS;
+    DEBUG("mcs_idx=%d, tbs=%d, mod=%d, rv=%d\n",
+           dci->mcs_idx, grant->mcs.tbs/8, grant->mcs.mod, dci->rv_idx);
   }
 }
 
-void srslte_ra_ul_grant_to_nbits(srslte_ra_ul_grant_t *grant, srslte_cp_t cp, uint32_t N_srs, srslte_ra_nbits_t *nbits) 
+void srslte_ra_ul_grant_to_nbits(srslte_ra_ul_grant_t *grant, srslte_cp_t cp, uint32_t N_srs, srslte_ra_nbits_t *nbits)
 {
-  nbits->nof_symb = 2*(SRSLTE_CP_NSYMB(cp)-1) - N_srs; 
+  nbits->nof_symb = 2*(SRSLTE_CP_NSYMB(cp)-1) - N_srs;
   nbits->nof_re   = nbits->nof_symb*grant->M_sc;
   nbits->nof_bits = nbits->nof_re * grant->Qm;
 }
 
 /** Compute PRB allocation for Uplink as defined in 8.1 and 8.4 of 36.213 */
-int srslte_ra_ul_dci_to_grant(srslte_ra_ul_dci_t *dci, uint32_t nof_prb, uint32_t n_rb_ho, srslte_ra_ul_grant_t *grant, 
-                              uint32_t harq_pid) 
+int srslte_ra_ul_dci_to_grant(srslte_ra_ul_dci_t *dci, uint32_t nof_prb, uint32_t n_rb_ho, srslte_ra_ul_grant_t *grant)
 {
-  
-  // Compute PRB allocation 
+
+  // Compute PRB allocation
   if (!srslte_ra_ul_dci_to_grant_prb_allocation(dci, grant, n_rb_ho, nof_prb)) {
-    
-    // Compute MCS 
-    if (!ul_dci_to_grant_mcs(dci, grant, harq_pid)) {
-      
-      // Fill rest of grant structure 
-      grant->mcs.idx = dci->mcs_idx;
-      grant->M_sc = grant->L_prb*SRSLTE_NRE;
-      grant->M_sc_init = grant->M_sc; // FIXME: What should M_sc_init be? 
-      grant->Qm = srslte_mod_bits_x_symbol(grant->mcs.mod);
-    } else {
-      fprintf(stderr, "Error computing MCS\n");
-      return SRSLTE_ERROR; 
-    }
+
+    // Compute MCS
+    ul_dci_to_grant_mcs(dci, grant);
+
+    // Fill rest of grant structure
+    grant->mcs.idx = dci->mcs_idx;
+    grant->M_sc = grant->L_prb*SRSLTE_NRE;
+    grant->M_sc_init = grant->M_sc; // FIXME: What should M_sc_init be?
+    grant->Qm = srslte_mod_bits_x_symbol(grant->mcs.mod);
+
   } else {
     printf("Error computing UL PRB allocation\n");
-    return SRSLTE_ERROR; 
+    return SRSLTE_ERROR;
   }
   return SRSLTE_SUCCESS;
 }
 
-uint32_t srslte_ra_dl_approx_nof_re(srslte_cell_t cell, uint32_t nof_prb, uint32_t nof_ctrl_symbols) 
+uint32_t srslte_ra_dl_approx_nof_re(srslte_cell_t cell, uint32_t nof_prb, uint32_t nof_ctrl_symbols)
 {
-  uint32_t nof_refs = 0; 
+  uint32_t nof_refs = 0;
   uint32_t nof_symb     = 2*SRSLTE_CP_NSYMB(cell.cp)-nof_ctrl_symbols;
   switch(cell.nof_ports) {
-    case 1: 
-      nof_refs = 2*3; 
-      break; 
-    case 2: 
-      nof_refs = 4*3; 
-      break; 
-    case 4: 
-      nof_refs = 4*4; 
-      break; 
+    case 1:
+      nof_refs = 2*3;
+      break;
+    case 2:
+      nof_refs = 4*3;
+      break;
+    case 4:
+      nof_refs = 4*4;
+      break;
   }
   return nof_prb * (nof_symb*SRSLTE_NRE-nof_refs);
 }
 
 /* Computes the number of RE for each PRB in the prb_dist structure */
-uint32_t srslte_ra_dl_grant_nof_re(srslte_ra_dl_grant_t *grant, srslte_cell_t cell, 
+uint32_t srslte_ra_dl_grant_nof_re(srslte_ra_dl_grant_t *grant, srslte_cell_t cell,
                                       uint32_t sf_idx, uint32_t nof_ctrl_symbols)
 {
   uint32_t j, s;
@@ -300,7 +279,7 @@ uint32_t srslte_ra_dl_grant_nof_re(srslte_ra_dl_grant_t *grant, srslte_cell_t ce
       }
     }
   }
-  return nof_re; 
+  return nof_re;
 }
 
 
@@ -315,7 +294,7 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
   uint32_t bitmask;
   uint32_t P = srslte_ra_type0_P(nof_prb);
   uint32_t n_rb_rbg_subset, n_rb_type1;
-  
+
   bzero(grant, sizeof(srslte_ra_dl_grant_t));
   switch (dci->alloc_type) {
   case SRSLTE_RA_ALLOC_TYPE0:
@@ -352,14 +331,14 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
             * P * P + dci->type1_alloc.rbg_subset * P + (i + shift) % P] = true;
           grant->nof_prb++;
         } else {
-          return SRSLTE_ERROR; 
+          return SRSLTE_ERROR;
         }
       }
     }
     memcpy(&grant->prb_idx[1], &grant->prb_idx[0], SRSLTE_MAX_PRB*sizeof(bool));
     break;
   case SRSLTE_RA_ALLOC_TYPE2:
-    if (dci->type2_alloc.mode == SRSLTE_RA_TYPE2_LOC) {      
+    if (dci->type2_alloc.mode == SRSLTE_RA_TYPE2_LOC) {
       for (i = 0; i < dci->type2_alloc.L_crb; i++) {
         grant->prb_idx[0][i + dci->type2_alloc.RB_start] = true;
         grant->nof_prb++;
@@ -408,13 +387,13 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
           if (n_tilde_prb_odd < nof_prb) {
             grant->prb_idx[0][n_tilde_prb_odd] = true;
           } else {
-            return SRSLTE_ERROR; 
+            return SRSLTE_ERROR;
           }
         } else {
           if (n_tilde_prb_odd + N_gap - N_tilde_vrb / 2 < nof_prb) {
             grant->prb_idx[0][n_tilde_prb_odd + N_gap - N_tilde_vrb / 2] = true;
           } else {
-            return SRSLTE_ERROR; 
+            return SRSLTE_ERROR;
           }
         }
         grant->nof_prb++;
@@ -422,13 +401,13 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
           if(n_tilde_prb_even < nof_prb) {
             grant->prb_idx[1][n_tilde_prb_even] = true;
           } else {
-            return SRSLTE_ERROR; 
+            return SRSLTE_ERROR;
           }
         } else {
           if (n_tilde_prb_even + N_gap - N_tilde_vrb / 2 < nof_prb) {
             grant->prb_idx[1][n_tilde_prb_even + N_gap - N_tilde_vrb / 2] = true;
           } else {
-            return SRSLTE_ERROR; 
+            return SRSLTE_ERROR;
           }
         }
       }
@@ -442,8 +421,7 @@ int srslte_ra_dl_dci_to_grant_prb_allocation(srslte_ra_dl_dci_t *dci, srslte_ra_
 }
 
 int srslte_dl_fill_ra_mcs(srslte_ra_mcs_t *mcs, uint32_t nprb) {
-  uint32_t i_tbs = 0; 
-  int tbs = -1; 
+  int i_tbs = 0;
   if (mcs->idx < 10) {
     mcs->mod = SRSLTE_MOD_QPSK;
     i_tbs = mcs->idx;
@@ -455,30 +433,26 @@ int srslte_dl_fill_ra_mcs(srslte_ra_mcs_t *mcs, uint32_t nprb) {
     i_tbs = mcs->idx-2;
   } else if (mcs->idx == 29) {
     mcs->mod = SRSLTE_MOD_QPSK;
-    tbs = 0;
-    i_tbs = 0;
+    i_tbs = -1;
   } else if (mcs->idx == 30) {
     mcs->mod = SRSLTE_MOD_16QAM;
-    tbs = 0;
-    i_tbs = 0;
+    i_tbs = -1;
   } else if (mcs->idx == 31) {
     mcs->mod = SRSLTE_MOD_64QAM;
-    tbs = 0;
-    i_tbs = 0;
+    i_tbs = -1;
   }
-  
-  if (tbs == -1) {
+
+  int tbs = -1;
+  if (i_tbs >= 0) {
     tbs = srslte_ra_tbs_from_idx(i_tbs, nprb);
-    if (tbs >= 0) {
-      mcs->tbs = tbs; 
-    }
-  }  
-  return tbs; 
+    mcs->tbs = tbs;
+  }
+  return tbs;
 }
 
 int srslte_dl_fill_ra_mcs_pmch(srslte_ra_mcs_t *mcs, uint32_t nprb) {
-  uint32_t i_tbs = 0; 
-  int tbs = -1; 
+  uint32_t i_tbs = 0;
+  int tbs = -1;
   if (mcs->idx < 5) {
     mcs->mod = SRSLTE_MOD_QPSK;
     i_tbs = mcs->idx*2;
@@ -492,7 +466,7 @@ int srslte_dl_fill_ra_mcs_pmch(srslte_ra_mcs_t *mcs, uint32_t nprb) {
     mcs->mod = SRSLTE_MOD_64QAM;
     i_tbs = mcs->idx + 5;
   }else if (mcs->idx < 28) {
-    //mcs->mod = SRSLTE_MOD_256QAM; 
+    //mcs->mod = SRSLTE_MOD_256QAM;
     i_tbs = mcs->idx + 5;
   }else if (mcs->idx == 28) {
     mcs->mod = SRSLTE_MOD_QPSK;
@@ -511,15 +485,15 @@ int srslte_dl_fill_ra_mcs_pmch(srslte_ra_mcs_t *mcs, uint32_t nprb) {
     tbs = 0;
     i_tbs = 0;
   }
-  
-  
+
+
   if (tbs == -1) {
     tbs = srslte_ra_tbs_from_idx(i_tbs, nprb);
     if (tbs >= 0) {
-      mcs->tbs = tbs; 
+      mcs->tbs = tbs;
     }
-  }  
-  return tbs; 
+  }
+  return tbs;
 }
 
 /* Modulation order and transport block size determination 7.1.7 in 36.213
@@ -530,9 +504,9 @@ int srslte_dl_fill_ra_mcs_pmch(srslte_ra_mcs_t *mcs, uint32_t nprb) {
  * */
 static int dl_dci_to_grant_mcs(srslte_ra_dl_dci_t *dci, srslte_ra_dl_grant_t *grant, bool crc_is_crnti) {
   uint32_t n_prb=0;
-  int tbs = -1; 
-  uint32_t i_tbs = 0; 
-  
+  int tbs = -1;
+  uint32_t i_tbs = 0;
+
   if (!crc_is_crnti) {
     if (dci->dci_is_1a) {
       n_prb = dci->type2_alloc.n_prb1a == SRSLTE_RA_TYPE2_NPRB1A_2 ? 2 : 3;
@@ -546,35 +520,23 @@ static int dl_dci_to_grant_mcs(srslte_ra_dl_dci_t *dci, srslte_ra_dl_grant_t *gr
       }
     } else {
       fprintf(stderr, "Error decoding DCI: P/SI/RA-RNTI supports Format1A/1C only\n");
-      return SRSLTE_ERROR; 
+      return SRSLTE_ERROR;
     }
     grant->mcs[0].mod = SRSLTE_MOD_QPSK;
     grant->mcs[0].tbs = (uint32_t) tbs;
   } else {
     n_prb = grant->nof_prb;
-    grant->nof_tb = 0; 
+    grant->nof_tb = 0;
     if (dci->tb_en[0]) {
       grant->mcs[0].idx = dci->mcs_idx;
-      tbs   = srslte_dl_fill_ra_mcs(&grant->mcs[0], n_prb);
-      if (tbs) {
-        last_dl_tbs[dci->harq_process%8] = tbs;
-      } else {
-        // For mcs>=29, set last TBS received for this PID
-        grant->mcs[0].tbs = last_dl_tbs[dci->harq_process%8];
-      }
+      grant->mcs[0].tbs = srslte_dl_fill_ra_mcs(&grant->mcs[0], n_prb);
       grant->nof_tb++;
     } else {
       grant->mcs[0].tbs = 0;
     }
     if (dci->tb_en[1]) {
       grant->mcs[1].idx = dci->mcs_idx_1;
-      tbs = srslte_dl_fill_ra_mcs(&grant->mcs[1], n_prb);
-      if (tbs) {
-        last_dl_tbs2[dci->harq_process%8] = tbs;
-      } else {
-        // For mcs>=29, set last TBS received for this PID
-        grant->mcs[1].tbs = last_dl_tbs2[dci->harq_process%8];
-      }
+      grant->mcs[1].tbs = srslte_dl_fill_ra_mcs(&grant->mcs[1], n_prb);
     } else {
       grant->mcs[1].tbs = 0;
     }
diff --git a/lib/src/phy/phch/test/pusch_test.c b/lib/src/phy/phch/test/pusch_test.c
index 9d048e680..cf0be75c3 100644
--- a/lib/src/phy/phch/test/pusch_test.c
+++ b/lib/src/phy/phch/test/pusch_test.c
@@ -136,7 +136,7 @@ int main(int argc, char **argv) {
   dci.mcs_idx = mcs_idx;
   
   srslte_ra_ul_grant_t grant; 
-  if (srslte_ra_ul_dci_to_grant(&dci, cell.nof_prb, 0, &grant, 0)) {
+  if (srslte_ra_ul_dci_to_grant(&dci, cell.nof_prb, 0, &grant)) {
     fprintf(stderr, "Error computing resource allocation\n");
     return ret;
   }
diff --git a/srsenb/src/phy/phch_worker.cc b/srsenb/src/phy/phch_worker.cc
index 52b617fd5..b80467f4d 100644
--- a/srsenb/src/phy/phch_worker.cc
+++ b/srsenb/src/phy/phch_worker.cc
@@ -408,7 +408,7 @@ int phch_worker::decode_pusch(srslte_enb_ul_pusch_t *grants, uint32_t nof_pusch)
 
       srslte_ra_ul_grant_t phy_grant;
       int res = -1;
-      if (!srslte_ra_ul_dci_to_grant(&grants[i].grant, enb_ul.cell.nof_prb, n_rb_ho, &phy_grant, tti_rx%8)) {
+      if (!srslte_ra_ul_dci_to_grant(&grants[i].grant, enb_ul.cell.nof_prb, n_rb_ho, &phy_grant)) {
         if (phy_grant.mcs.mod == SRSLTE_MOD_64QAM) {
           phy_grant.mcs.mod = SRSLTE_MOD_16QAM;
         }
diff --git a/srsue/hdr/phy/phch_worker.h b/srsue/hdr/phy/phch_worker.h
index 0811723e0..c26966c17 100644
--- a/srsue/hdr/phy/phch_worker.h
+++ b/srsue/hdr/phy/phch_worker.h
@@ -149,7 +149,12 @@ private:
   uint32_t                          I_sr; 
   float                             cfo;
   bool                              rar_cqi_request;
-    
+
+  // Save last TBS for mcs>28 cases
+  int last_dl_tbs[2*HARQ_DELAY_MS][SRSLTE_MAX_CODEWORDS];
+  int last_ul_tbs[2*HARQ_DELAY_MS];
+  srslte_mod_t last_ul_mod[2*HARQ_DELAY_MS];
+
   // Metrics
   dl_metrics_t dl_metrics;
   ul_metrics_t ul_metrics;
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index 1d86461cd..9fa3ff669 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -478,10 +478,20 @@ bool phch_worker::decode_pdcch_dl(srsue::mac_interface_phy::mac_grant_t* grant)
       return false;   
     }
 
+    grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
+
+    // Set last TBS for this TB (pid) in case of mcs>29 (7.1.7.2 of 36.213)
+    for (int i=0;i<SRSLTE_MAX_CODEWORDS;i++) {
+      if (grant->phy_grant.dl.mcs[i].tbs < 0) {
+        grant->phy_grant.dl.mcs[i].tbs = last_dl_tbs[grant->pid%(2*HARQ_DELAY_MS)][i];
+      }
+      // save it
+      last_dl_tbs[grant->pid%(2*HARQ_DELAY_MS)][i] = grant->phy_grant.dl.mcs[i].tbs;
+    }
+
     /* Fill MAC grant structure */
     grant->ndi[0] = dci_unpacked.ndi;
     grant->ndi[1] = dci_unpacked.ndi_1;
-    grant->pid = ASYNC_DL_SCHED?dci_unpacked.harq_process:(tti%(2*HARQ_DELAY_MS));
     grant->n_bytes[0] = grant->phy_grant.dl.mcs[0].tbs / (uint32_t) 8;
     grant->n_bytes[1] = grant->phy_grant.dl.mcs[1].tbs / (uint32_t) 8;
     grant->tti = tti;
@@ -718,14 +728,29 @@ bool phch_worker::decode_pdcch_ul(mac_interface_phy::mac_grant_t* grant)
            ue_dl.last_location_ul.ncce, (1<<ue_dl.last_location_ul.L), dci_msg.nof_bits, hexstr);
       
       if (grant->phy_grant.ul.mcs.tbs==0) {
-        srslte_vec_fprint_hex(stdout, dci_msg.data, dci_msg.nof_bits);
+        Info("Received PUSCH grant with empty data\n");
       }
     }
   }
-  
+
+  if (ret) {
+
+    // Use last TBS for this TB in case of mcs>28
+    if (grant->phy_grant.ul.mcs.tbs < 0) {
+      grant->phy_grant.ul.mcs.tbs = last_ul_tbs[tti%(2*HARQ_DELAY_MS)];
+    }
+    last_ul_tbs[tti%(2*HARQ_DELAY_MS)] = grant->phy_grant.ul.mcs.tbs;
+
+    if (grant->phy_grant.ul.mcs.mod == SRSLTE_MOD_LAST) {
+      grant->phy_grant.ul.mcs.mod = last_ul_mod[tti%(2*HARQ_DELAY_MS)];
+      grant->phy_grant.ul.Qm      = srslte_mod_bits_x_symbol(grant->phy_grant.ul.mcs.mod);
+    }
+    last_ul_mod[tti%(2*HARQ_DELAY_MS)] = grant->phy_grant.ul.mcs.mod;
+  }
+
   /* Limit UL modulation if not supported by the UE or disabled by higher layers */
   if (!phy->config->enable_64qam) {
-    if (grant->phy_grant.ul.mcs.mod == SRSLTE_MOD_64QAM) {
+    if (grant->phy_grant.ul.mcs.mod >= SRSLTE_MOD_64QAM) {
       grant->phy_grant.ul.mcs.mod = SRSLTE_MOD_16QAM;
       grant->phy_grant.ul.Qm      = 4;
     }
@@ -898,7 +923,7 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
   if (srslte_ue_ul_cfg_grant(&ue_ul, grant, TTI_TX(tti), rv, current_tx_nb)) {
     Error("Configuring UL grant\n");
   }
-  
+
   if (srslte_ue_ul_pusch_encode_rnti_softbuffer(&ue_ul, 
                                                 payload, uci_data, 
                                                 softbuffer,
@@ -925,12 +950,12 @@ void phch_worker::encode_pusch(srslte_ra_ul_grant_t *grant, uint8_t *payload, ui
 #endif
 
   Info("PUSCH: tti_tx=%d, alloc=(%d,%d), tbs=%d, mcs=%d, rv=%d, ack=%s, ri=%s, cfo=%.1f KHz%s\n",
-         (tti+4)%10240,
-         grant->n_prb[0], grant->n_prb[0]+grant->L_prb,
-         grant->mcs.tbs/8, grant->mcs.idx, rv,
-         uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
-         uci_data.uci_ri_len>0?(uci_data.uci_ri?"1":"0"):"no",
-         cfo*15, timestr);
+       (tti+HARQ_DELAY_MS)%10240,
+       grant->n_prb[0], grant->n_prb[0]+grant->L_prb,
+       grant->mcs.tbs/8, grant->mcs.idx, rv,
+       uci_data.uci_ack_len>0?(uci_data.uci_ack?"1":"0"):"no",
+       uci_data.uci_ri_len>0?(uci_data.uci_ri?"1":"0"):"no",
+       cfo*15, timestr);
 
   // Store metrics
   ul_metrics.mcs   = grant->mcs.idx;

From 9dbbe9731a472d54e2a2a7f47186a04f5f02bb5a Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 16:18:40 -0400
Subject: [PATCH 44/55] missing netmask string after merge

---
 srsue/hdr/ue_base.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/srsue/hdr/ue_base.h b/srsue/hdr/ue_base.h
index 411896f70..634219083 100644
--- a/srsue/hdr/ue_base.h
+++ b/srsue/hdr/ue_base.h
@@ -109,6 +109,7 @@ typedef struct {
   std::string ue_cateogry;
   bool metrics_csv_enable;
   std::string metrics_csv_filename;
+  std::string ip_netmask;
 }expert_args_t;
 
 typedef struct {

From 05d6a1c82927859f183230243dbf1fd8f6c245ad Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 16:26:00 -0400
Subject: [PATCH 45/55] added option to configure netmask (rules out previous
 commit)

---
 srsue/hdr/ue_base.h   |  2 +-
 srsue/hdr/upper/gw.h  |  5 +++++
 srsue/src/ue.cc       |  2 ++
 srsue/src/upper/gw.cc | 13 ++++++++++++-
 srsue/ue.conf.example |  4 +++-
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/srsue/hdr/ue_base.h b/srsue/hdr/ue_base.h
index 634219083..b547945a9 100644
--- a/srsue/hdr/ue_base.h
+++ b/srsue/hdr/ue_base.h
@@ -103,13 +103,13 @@ typedef struct {
 }gui_args_t;
 
 typedef struct {
+  std::string   ip_netmask;
   phy_args_t phy;
   float      metrics_period_secs;
   bool pregenerate_signals;
   std::string ue_cateogry;
   bool metrics_csv_enable;
   std::string metrics_csv_filename;
-  std::string ip_netmask;
 }expert_args_t;
 
 typedef struct {
diff --git a/srsue/hdr/upper/gw.h b/srsue/hdr/upper/gw.h
index 800b31624..b97ceb6c5 100644
--- a/srsue/hdr/upper/gw.h
+++ b/srsue/hdr/upper/gw.h
@@ -57,8 +57,13 @@ public:
   // NAS interface
   srslte::error_t setup_if_addr(uint32_t ip_addr, char *err_str);
 
+  void set_netmask(std::string netmask);
+
 private:
 
+  bool default_netmask;
+  std::string netmask;
+
   static const int GW_THREAD_PRIO = 7;
 
   pdcp_interface_gw  *pdcp;
diff --git a/srsue/src/ue.cc b/srsue/src/ue.cc
index 3560a78fe..a99281e6f 100644
--- a/srsue/src/ue.cc
+++ b/srsue/src/ue.cc
@@ -185,6 +185,8 @@ bool ue::init(all_args_t *args_)
   nas.init(&usim, &rrc, &gw, &nas_log, 1 /* RB_ID_SRB1 */);
   gw.init(&pdcp, &nas, &gw_log, 3 /* RB_ID_DRB1 */);
 
+  gw.set_netmask(args->expert.ip_netmask);
+
   rrc.init(&phy, &mac, &rlc, &pdcp, &nas, &usim, &mac, &rrc_log);
   rrc.set_ue_category(atoi(args->expert.ue_cateogry.c_str()));
 
diff --git a/srsue/src/upper/gw.cc b/srsue/src/upper/gw.cc
index 07ac36989..1e3e81999 100644
--- a/srsue/src/upper/gw.cc
+++ b/srsue/src/upper/gw.cc
@@ -44,6 +44,7 @@ gw::gw()
   :if_up(false)
 {
   current_ip_addr = 0;
+  default_netmask = true;
 }
 
 void gw::init(pdcp_interface_gw *pdcp_, nas_interface_gw *nas_, srslte::log *gw_log_, uint32_t lcid_)
@@ -104,6 +105,12 @@ void gw::get_metrics(gw_metrics_t &m)
   ul_tput_bytes = 0;
 }
 
+void gw::set_netmask(std::string netmask) {
+  default_netmask = false;
+  this->netmask = netmask;
+}
+
+
 /*******************************************************************************
   PDCP interface
 *******************************************************************************/
@@ -152,7 +159,11 @@ srslte::error_t gw::setup_if_addr(uint32_t ip_addr, char *err_str)
       return(srslte::ERROR_CANT_START);
     }
     ifr.ifr_netmask.sa_family                                 = AF_INET;
-    ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr = inet_addr("255.255.255.0");
+    const char *mask = "255.255.255.0";
+    if (!default_netmask) {
+      mask = netmask.c_str();
+    }
+    ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr = inet_addr(mask);
     if(0 > ioctl(sock, SIOCSIFNETMASK, &ifr))
     {
       err_str = strerror(errno);
diff --git a/srsue/ue.conf.example b/srsue/ue.conf.example
index 5d14d3c3c..d0dc67952 100644
--- a/srsue/ue.conf.example
+++ b/srsue/ue.conf.example
@@ -98,7 +98,8 @@ enable = false
 #####################################################################
 # Expert configuration options
 #
-# ue_category:          Sets UE category (range 1-5). Default: 4 
+# ip_netmask:           Netmask of the tun_srsue device. Default: 255.255.255.0
+# ue_category:          Sets UE category (range 1-5). Default: 4
 #
 # prach_gain:           PRACH gain (dB). If defined, forces a gain for the tranmsission of PRACH only., 
 #                       Default is to use tx_gain in [rf] section. 
@@ -138,6 +139,7 @@ enable = false
 #
 #####################################################################
 [expert]
+#ip_netmask          = 255.255.255.0
 #ue_category         = 4
 #prach_gain          = 30
 #cqi_max             = 15

From 49a105baed23e14fcac95e23a005837477b6b41c Mon Sep 17 00:00:00 2001
From: Paul Sutton <suttonpd@gmail.com>
Date: Thu, 19 Oct 2017 22:21:18 +0100
Subject: [PATCH 46/55] Fix for tests, minor fix for RLC UM

---
 lib/src/upper/rlc_um.cc       | 2 +-
 lib/test/upper/rlc_um_test.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/src/upper/rlc_um.cc b/lib/src/upper/rlc_um.cc
index 045fb0b7e..d95e186e7 100644
--- a/lib/src/upper/rlc_um.cc
+++ b/lib/src/upper/rlc_um.cc
@@ -277,7 +277,7 @@ int  rlc_um::build_data_pdu(uint8_t *payload, uint32_t nof_bytes)
   int head_len  = rlc_um_packed_length(&header);
   int pdu_space = nof_bytes;
 
-  if(pdu_space <= head_len)
+  if(pdu_space <= head_len + 1)
   {
     log->warning("%s Cannot build a PDU - %d bytes available, %d bytes required for header\n",
                  rrc->get_rb_name(lcid).c_str(), nof_bytes, head_len);
diff --git a/lib/test/upper/rlc_um_test.cc b/lib/test/upper/rlc_um_test.cc
index 0894a2a8b..8abcfae3c 100644
--- a/lib/test/upper/rlc_um_test.cc
+++ b/lib/test/upper/rlc_um_test.cc
@@ -123,7 +123,7 @@ void basic_test()
   byte_buffer_t pdu_bufs[NBUFS];
   for(int i=0;i<NBUFS;i++)
   {
-    len = rlc1.read_pdu(pdu_bufs[i].msg, 3); // 3 bytes for header + payload
+    len = rlc1.read_pdu(pdu_bufs[i].msg, 4); // 3 bytes for header + payload
     pdu_bufs[i].N_bytes = len;
   }
 
@@ -191,7 +191,7 @@ void loss_test()
   byte_buffer_t pdu_bufs[NBUFS];
   for(int i=0;i<NBUFS;i++)
   {
-    len = rlc1.read_pdu(pdu_bufs[i].msg, 3); // 3 bytes for header + payload
+    len = rlc1.read_pdu(pdu_bufs[i].msg, 4); // 3 bytes for header + payload
     pdu_bufs[i].N_bytes = len;
   }
 

From dfef7c1a2565e41808d780f104112b771cf7d946 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 18:10:17 -0400
Subject: [PATCH 47/55] Coverity: uninitialized member

---
 srsue/src/mac/demux.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srsue/src/mac/demux.cc b/srsue/src/mac/demux.cc
index 14c524165..171e7bf45 100644
--- a/srsue/src/mac/demux.cc
+++ b/srsue/src/mac/demux.cc
@@ -36,7 +36,7 @@
 
 namespace srsue {
     
-demux::demux() : mac_msg(20), pending_mac_msg(20)
+demux::demux() : mac_msg(20), pending_mac_msg(20), rlc(NULL)
 {
 }
 

From c14393b24f874b4e12d3bbfb883133fedda4f6f1 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Thu, 19 Oct 2017 19:39:54 -0400
Subject: [PATCH 48/55] Disable RSSI sensor by default

---
 srsue/src/main.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srsue/src/main.cc b/srsue/src/main.cc
index 3b0cad2d0..1b146a9f3 100644
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@@ -155,7 +155,7 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
      "Pregenerate uplink signals after attach. Improves CPU performance.")
 
     ("expert.rssi_sensor_enabled",
-     bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(true),
+     bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(false),
      "Enable or disable RF frontend RSSI sensor. In some USRP devices can cause segmentation fault")
 
     ("expert.prach_gain",

From a570e63c5b8977a8f864cde6d7103a2233ae8c83 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Fri, 20 Oct 2017 15:05:04 +0200
Subject: [PATCH 49/55] Corrected bug in RA

---
 lib/include/srslte/phy/phch/ra.h |  2 --
 lib/src/phy/phch/ra.c            | 14 ++++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/include/srslte/phy/phch/ra.h b/lib/include/srslte/phy/phch/ra.h
index 680d80dd6..aef8f4751 100644
--- a/lib/include/srslte/phy/phch/ra.h
+++ b/lib/include/srslte/phy/phch/ra.h
@@ -103,9 +103,7 @@ typedef struct SRSLTE_API {
   bool prb_idx[2][SRSLTE_MAX_PRB];
   uint32_t nof_prb;  
   uint32_t Qm[SRSLTE_MAX_CODEWORDS];
-  uint32_t Qm2[SRSLTE_MAX_CODEWORDS];
   srslte_ra_mcs_t mcs[SRSLTE_MAX_CODEWORDS];
-  srslte_ra_mcs_t mcs2[SRSLTE_MAX_CODEWORDS];
   uint32_t nof_tb;
   srslte_sf_t sf_type;
   bool tb_en[SRSLTE_MAX_CODEWORDS];
diff --git a/lib/src/phy/phch/ra.c b/lib/src/phy/phch/ra.c
index 8cae9fe65..913bd9548 100644
--- a/lib/src/phy/phch/ra.c
+++ b/lib/src/phy/phch/ra.c
@@ -549,7 +549,7 @@ static int dl_dci_to_grant_mcs(srslte_ra_dl_dci_t *dci, srslte_ra_dl_grant_t *gr
   }
   grant->pinfo = dci->pinfo;
 
-  if (tbs < 0) {
+  if (grant->mcs[0].tbs < 0 || grant->mcs[1].tbs < 0) {
     return SRSLTE_ERROR; 
   } else {    
     return SRSLTE_SUCCESS; 
@@ -584,10 +584,12 @@ int srslte_ra_dl_dci_to_grant(srslte_ra_dl_dci_t *dci,
   if (msg_rnti >= SRSLTE_CRNTI_START && msg_rnti <= SRSLTE_CRNTI_END) {
     crc_is_crnti = true; 
   }
-  // Compute PRB allocation 
-  if (!srslte_ra_dl_dci_to_grant_prb_allocation(dci, grant, nof_prb)) {
-    // Compute MCS 
-    if (!dl_dci_to_grant_mcs(dci, grant, crc_is_crnti)) {            
+  // Compute PRB allocation
+  int ret =srslte_ra_dl_dci_to_grant_prb_allocation(dci, grant, nof_prb);
+  if (!ret) {
+    // Compute MCS
+    ret = dl_dci_to_grant_mcs(dci, grant, crc_is_crnti);
+    if (ret == SRSLTE_SUCCESS) {
       // Apply Section 7.1.7.3. If RA-RNTI and Format1C rv_idx=0
       if (msg_rnti >= SRSLTE_RARNTI_START && msg_rnti <= SRSLTE_RARNTI_END && 
         dci->dci_is_1c) 
@@ -869,4 +871,4 @@ void srslte_ra_prb_fprint(FILE *f, srslte_ra_dl_grant_t *grant) {
     }
   }
   
-}
\ No newline at end of file
+}

From 3b4649b9f730786d010962b33b924693137d8925 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 20 Oct 2017 09:29:56 -0400
Subject: [PATCH 50/55] Fixed bug in calc_is_new_transmission

---
 srsue/hdr/mac/dl_harq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srsue/hdr/mac/dl_harq.h b/srsue/hdr/mac/dl_harq.h
index 9abcf0922..521018d73 100644
--- a/srsue/hdr/mac/dl_harq.h
+++ b/srsue/hdr/mac/dl_harq.h
@@ -347,7 +347,7 @@ private:
       // Determine if it's a new transmission 5.3.2.2
       bool calc_is_new_transmission(Tgrant grant) {
 
-        if (grant.phy_grant.dl.mcs[tid].idx < 28 &&          // mcs 29,30,31 always retx regardless of rest
+        if (grant.phy_grant.dl.mcs[tid].idx <= 28 &&          // mcs 29,30,31 always retx regardless of rest
             ((grant.ndi[tid] != cur_grant.ndi[tid])       || // 1st condition (NDI has changed)
              (pid == HARQ_BCCH_PID && grant.rv[tid] == 0) || // 2nd condition (Broadcast and 1st transmission)
              is_first_tb))

From c8bba2f4d07ada34376ff19c19c10a9cf50f7a17 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier.arteaga@softwareradiosystems.com>
Date: Fri, 20 Oct 2017 16:09:27 +0200
Subject: [PATCH 51/55] DFT optimization. memcpy removal (#76)

* Solved PHICH Segmentation fault for MIMO

* Initial Guru FFT optimization

* Guru (i)FFT implemented. All tests passed!

* Integrated new DFT into pdsch_enodeb and pdsch_ue. Solved more DFT bugs.

* Solved Merge Errors and bugs

* Solved UL Guru bug (DC missing). Updated Init and OFDM calls for enb and ue (cell measurement too).
---
 lib/examples/cell_measurement.c               |  28 +-
 lib/examples/pdsch_enodeb.c                   |  26 +-
 lib/examples/pdsch_ue.c                       |  13 +-
 lib/include/srslte/phy/common/phy_common.h    |   4 +-
 lib/include/srslte/phy/dft/dft.h              |  24 ++
 lib/include/srslte/phy/dft/ofdm.h             |  40 +--
 lib/include/srslte/phy/enb/enb_dl.h           |   8 +-
 lib/include/srslte/phy/enb/enb_ul.h           |   1 +
 lib/include/srslte/phy/ue/ue_dl.h             |   3 +-
 lib/include/srslte/phy/ue/ue_mib.h            |   2 +-
 lib/include/srslte/phy/ue/ue_ul.h             |   1 +
 lib/src/phy/dft/dft_fftw.c                    |  62 +++-
 lib/src/phy/dft/ofdm.c                        | 271 +++++++++++++++---
 lib/src/phy/dft/test/ofdm_test.c              |  63 +++-
 lib/src/phy/enb/enb_dl.c                      |  57 ++--
 lib/src/phy/enb/enb_ul.c                      |  31 +-
 lib/src/phy/phch/test/pbch_file_test.c        |   4 +-
 lib/src/phy/phch/test/pcfich_file_test.c      |  10 +-
 lib/src/phy/phch/test/pdcch_file_test.c       |   6 +-
 lib/src/phy/phch/test/pdsch_pdcch_file_test.c |   4 +-
 lib/src/phy/phch/test/phich_file_test.c       |   6 +-
 lib/src/phy/phch/test/pmch_file_test.c        |   2 +-
 lib/src/phy/phch/test/pmch_test.c             | 122 ++++----
 lib/src/phy/sync/test/sync_test.c             |  12 +-
 lib/src/phy/ue/ue_dl.c                        |  77 ++---
 lib/src/phy/ue/ue_mib.c                       |  13 +-
 lib/src/phy/ue/ue_ul.c                        |  22 +-
 srsenb/hdr/phy/phch_worker.h                  |   2 +-
 srsenb/src/phy/phch_worker.cc                 |  19 +-
 srsue/src/phy/phch_recv.cc                    |   8 +-
 srsue/src/phy/phch_worker.cc                  |   4 +-
 31 files changed, 647 insertions(+), 298 deletions(-)

diff --git a/lib/examples/cell_measurement.c b/lib/examples/cell_measurement.c
index 37796682f..bfb8194df 100644
--- a/lib/examples/cell_measurement.c
+++ b/lib/examples/cell_measurement.c
@@ -249,7 +249,7 @@ int main(int argc, char **argv) {
     fprintf(stderr, "Error initiating ue_sync\n");
     return -1;
   }
-  if (srslte_ue_dl_init(&ue_dl, cell.nof_prb, 1)) {
+  if (srslte_ue_dl_init(&ue_dl, sf_buffer, cell.nof_prb, 1)) {
     fprintf(stderr, "Error initiating UE downlink processing module\n");
     return -1;
   }
@@ -257,7 +257,7 @@ int main(int argc, char **argv) {
     fprintf(stderr, "Error initiating UE downlink processing module\n");
     return -1;
   }
-  if (srslte_ue_mib_init(&ue_mib, cell.nof_prb)) {
+  if (srslte_ue_mib_init(&ue_mib, sf_buffer, cell.nof_prb)) {
     fprintf(stderr, "Error initaiting UE MIB decoder\n");
     return -1;
   }
@@ -271,8 +271,16 @@ int main(int argc, char **argv) {
 
   /* Initialize subframe counter */
   sf_cnt = 0;
-    
-  if (srslte_ofdm_rx_init(&fft, cell.cp, cell.nof_prb)) {
+
+  int sf_re = SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp);
+
+  cf_t *sf_symbols = srslte_vec_malloc(sf_re * sizeof(cf_t));
+
+  for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
+    ce[i] = srslte_vec_malloc(sizeof(cf_t) * sf_re);
+  }
+
+  if (srslte_ofdm_rx_init(&fft, cell.cp, sf_buffer[0], sf_symbols, cell.nof_prb)) {
     fprintf(stderr, "Error initiating FFT\n");
     return -1;
   }
@@ -284,14 +292,6 @@ int main(int argc, char **argv) {
     fprintf(stderr, "Error initiating channel estimator\n");
     return -1;
   }
-
-  int sf_re = SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp);
-
-  cf_t *sf_symbols = srslte_vec_malloc(sf_re * sizeof(cf_t));
-
-  for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
-    ce[i] = srslte_vec_malloc(sizeof(cf_t) * sf_re);
-  }
   
   srslte_rf_start_rx_stream(&rf);
   
@@ -315,7 +315,7 @@ int main(int argc, char **argv) {
         case DECODE_MIB:
           if (srslte_ue_sync_get_sfidx(&ue_sync) == 0) {
             srslte_pbch_decode_reset(&ue_mib.pbch);
-            n = srslte_ue_mib_decode(&ue_mib, sf_buffer[0], bch_payload, NULL, &sfn_offset);
+            n = srslte_ue_mib_decode(&ue_mib, bch_payload, NULL, &sfn_offset);
             if (n < 0) {
               fprintf(stderr, "Error decoding UE MIB\n");
               return -1;
@@ -351,7 +351,7 @@ int main(int argc, char **argv) {
         
         if (srslte_ue_sync_get_sfidx(&ue_sync) == 5) {
           /* Run FFT for all subframe data */
-          srslte_ofdm_rx_sf(&fft, sf_buffer[0], sf_symbols);
+          srslte_ofdm_rx_sf(&fft);
           
           srslte_chest_dl_estimate(&chest, sf_symbols, ce, srslte_ue_sync_get_sfidx(&ue_sync));
                   
diff --git a/lib/examples/pdsch_enodeb.c b/lib/examples/pdsch_enodeb.c
index 98ad78507..f7f17bc2b 100644
--- a/lib/examples/pdsch_enodeb.c
+++ b/lib/examples/pdsch_enodeb.c
@@ -86,7 +86,7 @@ float rf_amp = 0.8, rf_gain = 70.0, rf_freq = 2400000000;
 
 bool null_file_sink=false; 
 srslte_filesink_t fsink;
-srslte_ofdm_t ifft;
+srslte_ofdm_t ifft[SRSLTE_MAX_PORTS];
 srslte_ofdm_t ifft_mbsfn;
 srslte_pbch_t pbch;
 srslte_pcfich_t pcfich;
@@ -311,18 +311,21 @@ void base_init() {
   }
 
   /* create ifft object */
-  if (srslte_ofdm_tx_init(&ifft, SRSLTE_CP_NORM, cell.nof_prb)) {
-    fprintf(stderr, "Error creating iFFT object\n");
-    exit(-1);
+  for (i = 0; i < cell.nof_ports; i++) {
+    if (srslte_ofdm_tx_init(&ifft[i], SRSLTE_CP_NORM, sf_buffer[i], output_buffer[i], cell.nof_prb)) {
+      fprintf(stderr, "Error creating iFFT object\n");
+      exit(-1);
+    }
+
+    srslte_ofdm_set_normalize(&ifft[i], true);
   }
-  if (srslte_ofdm_tx_init_mbsfn(&ifft_mbsfn, SRSLTE_CP_EXT, cell.nof_prb)) {
+
+  if (srslte_ofdm_tx_init_mbsfn(&ifft_mbsfn, SRSLTE_CP_EXT, sf_buffer[0], output_buffer[0], cell.nof_prb)) {
     fprintf(stderr, "Error creating iFFT object\n");
     exit(-1);
   }
   srslte_ofdm_set_non_mbsfn_region(&ifft_mbsfn, 2);
   srslte_ofdm_set_normalize(&ifft_mbsfn, true);
-  srslte_ofdm_set_normalize(&ifft, true);
-  
   
   if (srslte_pbch_init(&pbch)) {
     fprintf(stderr, "Error creating PBCH object\n");
@@ -413,8 +416,9 @@ void base_free() {
     srslte_pmch_free(&pmch); 
   }
   srslte_ofdm_tx_free(&ifft_mbsfn);
-  srslte_ofdm_tx_free(&ifft);
-  
+  for (i = 0; i < cell.nof_ports; i++) {
+    srslte_ofdm_tx_free(&ifft[i]);
+  }
 
   for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
     if (data[i]) {
@@ -977,10 +981,10 @@ int main(int argc, char **argv) {
       /* Transform to OFDM symbols */
       if(sf_idx != 1 || mbsfn_area_id < 0){
         for (i = 0; i < cell.nof_ports; i++) {
-          srslte_ofdm_tx_sf(&ifft, sf_buffer[i], output_buffer[i]);
+          srslte_ofdm_tx_sf(&ifft[i]);
         }
       }else{
-        srslte_ofdm_tx_sf(&ifft_mbsfn, sf_buffer[0], output_buffer[0]);
+        srslte_ofdm_tx_sf(&ifft_mbsfn);
       }
       
       /* send to file or usrp */
diff --git a/lib/examples/pdsch_ue.c b/lib/examples/pdsch_ue.c
index 9d7282cdd..bf4414f08 100644
--- a/lib/examples/pdsch_ue.c
+++ b/lib/examples/pdsch_ue.c
@@ -510,7 +510,10 @@ int main(int argc, char **argv) {
 #endif
   }
 
-  if (srslte_ue_mib_init(&ue_mib, cell.nof_prb)) {
+  for (int i=0;i<prog_args.rf_nof_rx_ant;i++) {
+    sf_buffer[i] = srslte_vec_malloc(3*sizeof(cf_t)*SRSLTE_SF_LEN_PRB(cell.nof_prb));
+  }
+  if (srslte_ue_mib_init(&ue_mib, sf_buffer, cell.nof_prb)) {
     fprintf(stderr, "Error initaiting UE MIB decoder\n");
     exit(-1);
   }
@@ -519,7 +522,7 @@ int main(int argc, char **argv) {
     exit(-1);
   }
 
-  if (srslte_ue_dl_init(&ue_dl, cell.nof_prb, prog_args.rf_nof_rx_ant)) {
+  if (srslte_ue_dl_init(&ue_dl, sf_buffer, cell.nof_prb, prog_args.rf_nof_rx_ant)) {
     fprintf(stderr, "Error initiating UE downlink processing module\n");
     exit(-1);
   }
@@ -527,10 +530,6 @@ int main(int argc, char **argv) {
     fprintf(stderr, "Error initiating UE downlink processing module\n");
     exit(-1);
   }
-
-  for (int i=0;i<prog_args.rf_nof_rx_ant;i++) {
-    sf_buffer[i] = srslte_vec_malloc(3*sizeof(cf_t)*SRSLTE_SF_LEN_PRB(cell.nof_prb));
-  }
   
   /* Configure downlink receiver for the SI-RNTI since will be the only one we'll use */
   srslte_ue_dl_set_rnti(&ue_dl, prog_args.rnti); 
@@ -632,7 +631,7 @@ int main(int argc, char **argv) {
       switch (state) {
         case DECODE_MIB:
           if (sfidx == 0) {
-            n = srslte_ue_mib_decode(&ue_mib, sf_buffer[0], bch_payload, NULL, &sfn_offset);
+            n = srslte_ue_mib_decode(&ue_mib, bch_payload, NULL, &sfn_offset);
             if (n < 0) {
               fprintf(stderr, "Error decoding UE MIB\n");
               exit(-1);
diff --git a/lib/include/srslte/phy/common/phy_common.h b/lib/include/srslte/phy/common/phy_common.h
index 148a12974..01d66d72a 100644
--- a/lib/include/srslte/phy/common/phy_common.h
+++ b/lib/include/srslte/phy/common/phy_common.h
@@ -101,8 +101,8 @@ typedef enum {SRSLTE_SF_NORM, SRSLTE_SF_MBSFN} srslte_sf_t;
 #define SRSLTE_CP_ISEXT(cp) (cp==SRSLTE_CP_EXT)
 #define SRSLTE_CP_NSYMB(cp) (SRSLTE_CP_ISNORM(cp)?SRSLTE_CP_NORM_NSYMB:SRSLTE_CP_EXT_NSYMB)
 
-#define SRSLTE_CP_LEN(symbol_sz, c)           ((int) ceil((((float) (c)*(symbol_sz))/2048)))
-#define SRSLTE_CP_LEN_NORM(symbol, symbol_sz) ((symbol==0)?SRSLTE_CP_LEN((symbol_sz),SRSLTE_CP_NORM_0_LEN):SRSLTE_CP_LEN((symbol_sz),SRSLTE_CP_NORM_LEN))
+#define SRSLTE_CP_LEN(symbol_sz, c)           ((int) ceilf((((float) (c)*(symbol_sz))/2048.0f)))
+#define SRSLTE_CP_LEN_NORM(symbol, symbol_sz) (((symbol)==0)?SRSLTE_CP_LEN((symbol_sz),SRSLTE_CP_NORM_0_LEN):SRSLTE_CP_LEN((symbol_sz),SRSLTE_CP_NORM_LEN))
 #define SRSLTE_CP_LEN_EXT(symbol_sz)          (SRSLTE_CP_LEN((symbol_sz),SRSLTE_CP_EXT_LEN))
 
 #define SRSLTE_SLOT_LEN(symbol_sz)     (symbol_sz*15/2)
diff --git a/lib/include/srslte/phy/dft/dft.h b/lib/include/srslte/phy/dft/dft.h
index b7fc663d8..b3dd4378b 100644
--- a/lib/include/srslte/phy/dft/dft.h
+++ b/lib/include/srslte/phy/dft/dft.h
@@ -63,6 +63,7 @@ typedef struct SRSLTE_API {
   void *in;           // Input buffer
   void *out;          // Output buffer
   void *p;            // DFT plan
+  bool is_guru;
   bool forward;       // Forward transform?
   bool mirror;        // Shift negative and positive frequencies?
   bool db;            // Provide output in dB?
@@ -85,6 +86,17 @@ SRSLTE_API int srslte_dft_plan_c(srslte_dft_plan_t *plan,
                                  int dft_points, 
                                  srslte_dft_dir_t dir);
 
+SRSLTE_API int srslte_dft_plan_guru_c(srslte_dft_plan_t *plan,
+                                      int dft_points,
+                                      srslte_dft_dir_t dir,
+                                      cf_t *in_buffer,
+                                      cf_t *out_buffer,
+                                      int istride,
+                                      int ostride,
+                                      int how_many,
+                                      int idist,
+                                      int odist);
+
 SRSLTE_API int srslte_dft_plan_r(srslte_dft_plan_t *plan, 
                                  int dft_points, 
                                  srslte_dft_dir_t dir);
@@ -92,6 +104,16 @@ SRSLTE_API int srslte_dft_plan_r(srslte_dft_plan_t *plan,
 SRSLTE_API int srslte_dft_replan(srslte_dft_plan_t *plan,
                                  const int new_dft_points);
 
+SRSLTE_API int srslte_dft_replan_guru_c(srslte_dft_plan_t *plan,
+                                        const int new_dft_points,
+                                        cf_t *in_buffer,
+                                        cf_t *out_buffer,
+                                        int istride,
+                                        int ostride,
+                                        int how_many,
+                                        int idist,
+                                        int odist);
+
 SRSLTE_API int srslte_dft_replan_c(srslte_dft_plan_t *plan,
                                    int new_dft_points);
 
@@ -129,6 +151,8 @@ SRSLTE_API void srslte_dft_run_c(srslte_dft_plan_t *plan,
                                  cf_t *in, 
                                  cf_t *out);
 
+SRSLTE_API void srslte_dft_run_guru_c(srslte_dft_plan_t *plan);
+
 SRSLTE_API void srslte_dft_run_r(srslte_dft_plan_t *plan, 
                                  float *in, 
                                  float *out);
diff --git a/lib/include/srslte/phy/dft/ofdm.h b/lib/include/srslte/phy/dft/ofdm.h
index 1363f5638..cba963ca4 100644
--- a/lib/include/srslte/phy/dft/ofdm.h
+++ b/lib/include/srslte/phy/dft/ofdm.h
@@ -47,14 +47,18 @@
 /* This is common for both directions */
 typedef struct SRSLTE_API{
   srslte_dft_plan_t fft_plan;
+  srslte_dft_plan_t fft_plan_sf[2];
   uint32_t max_prb;
   uint32_t nof_symbols;
   uint32_t symbol_sz;
   uint32_t nof_guards;
   uint32_t nof_re;
   uint32_t slot_sz;
+  uint32_t sf_sz;
   srslte_cp_t cp;
   cf_t *tmp; // for removing zero padding
+  cf_t *in_buffer;
+  cf_t *out_buffer;
   
   bool     mbsfn_subframe;
   uint32_t mbsfn_guard_len;
@@ -69,12 +73,16 @@ typedef struct SRSLTE_API{
 
 SRSLTE_API int srslte_ofdm_init_(srslte_ofdm_t *q, 
                                  srslte_cp_t cp, 
+                                 cf_t *in_buffer,
+                                 cf_t *out_buffer,
                                  int symbol_sz, 
                                  int nof_prb, 
                                  srslte_dft_dir_t dir);
 
 SRSLTE_API int srslte_ofdm_init_mbsfn_(srslte_ofdm_t *q, 
                                  srslte_cp_t cp, 
+                                 cf_t *in_buffer,
+                                 cf_t *out_buffer,
                                  int symbol_sz, 
                                  int nof_prb, 
                                  srslte_dft_dir_t dir,
@@ -82,12 +90,14 @@ SRSLTE_API int srslte_ofdm_init_mbsfn_(srslte_ofdm_t *q,
 
 SRSLTE_API int srslte_ofdm_rx_init_mbsfn(srslte_ofdm_t *q,
                                          srslte_cp_t cp_type,
+                                         cf_t *in_buffer,
+                                         cf_t *out_buffer,
                                          uint32_t nof_prb);
 
-
-
 SRSLTE_API int srslte_ofdm_rx_init(srslte_ofdm_t *q, 
                                srslte_cp_t cp_type, 
+                               cf_t *in_buffer,
+                               cf_t *out_buffer,
                                uint32_t max_prb);
 
 SRSLTE_API int srslte_ofdm_tx_set_prb(srslte_ofdm_t *q,
@@ -100,39 +110,35 @@ SRSLTE_API int srslte_ofdm_rx_set_prb(srslte_ofdm_t *q,
 
 SRSLTE_API void srslte_ofdm_rx_free(srslte_ofdm_t *q);
 
-SRSLTE_API void srslte_ofdm_rx_slot(srslte_ofdm_t *q, 
-                                    cf_t *input, 
-                                    cf_t *output);
-
-SRSLTE_API void srslte_ofdm_rx_sf(srslte_ofdm_t *q, 
-                                  cf_t *input, 
-                                  cf_t *output);
-
+SRSLTE_API void srslte_ofdm_rx_slot(srslte_ofdm_t *q,
+                                    int slot_in_sf);
 
+SRSLTE_API void srslte_ofdm_rx_sf(srslte_ofdm_t *q);
 
 SRSLTE_API int srslte_ofdm_tx_init(srslte_ofdm_t *q, 
                                     srslte_cp_t cp_type, 
+                                    cf_t *in_buffer,
+                                    cf_t *out_buffer,
                                     uint32_t nof_prb);
 
 SRSLTE_API int srslte_ofdm_tx_init_mbsfn(srslte_ofdm_t *q,
                                         srslte_cp_t cp, 
+                                         cf_t *in_buffer,
+                                         cf_t *out_buffer,
                                         uint32_t nof_prb);
 
 
 SRSLTE_API void srslte_ofdm_tx_free(srslte_ofdm_t *q);
 
-SRSLTE_API void srslte_ofdm_tx_slot(srslte_ofdm_t *q, 
-                                  cf_t *input, 
-                                  cf_t *output);
+SRSLTE_API void srslte_ofdm_tx_slot(srslte_ofdm_t *q,
+                                    int slot_in_sf);
 
 SRSLTE_API void srslte_ofdm_tx_slot_mbsfn(srslte_ofdm_t *q,
                                          cf_t *input,
                                          cf_t *output);
 
 
-SRSLTE_API void srslte_ofdm_tx_sf(srslte_ofdm_t *q, 
-                                cf_t *input, 
-                                cf_t *output);
+SRSLTE_API void srslte_ofdm_tx_sf(srslte_ofdm_t *q);
 
 SRSLTE_API int srslte_ofdm_set_freq_shift(srslte_ofdm_t *q, 
                                          float freq_shift); 
@@ -144,4 +150,4 @@ SRSLTE_API void srslte_ofdm_set_non_mbsfn_region(srslte_ofdm_t *q,
                                                uint8_t non_mbsfn_region);
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/include/srslte/phy/enb/enb_dl.h b/lib/include/srslte/phy/enb/enb_dl.h
index 7c3166e5c..8d4f3a73a 100644
--- a/lib/include/srslte/phy/enb/enb_dl.h
+++ b/lib/include/srslte/phy/enb/enb_dl.h
@@ -68,7 +68,7 @@ typedef struct SRSLTE_API {
   cf_t *sf_symbols[SRSLTE_MAX_PORTS]; 
   cf_t *slot1_symbols[SRSLTE_MAX_PORTS];
   
-  srslte_ofdm_t   ifft;
+  srslte_ofdm_t   ifft[SRSLTE_MAX_PORTS];
   srslte_pbch_t   pbch;
   srslte_pcfich_t pcfich;
   srslte_regs_t   regs;
@@ -109,7 +109,8 @@ typedef struct {
 } srslte_enb_dl_phich_t; 
 
 /* This function shall be called just after the initial synchronization */
-SRSLTE_API int srslte_enb_dl_init(srslte_enb_dl_t *q, 
+SRSLTE_API int srslte_enb_dl_init(srslte_enb_dl_t *q,
+                                  cf_t *out_buffer[SRSLTE_MAX_PORTS],
                                   uint32_t max_prb);
 
 SRSLTE_API void srslte_enb_dl_free(srslte_enb_dl_t *q);
@@ -146,8 +147,7 @@ SRSLTE_API void srslte_enb_dl_put_phich(srslte_enb_dl_t *q,
 SRSLTE_API void srslte_enb_dl_put_base(srslte_enb_dl_t *q, 
                                        uint32_t tti);
 
-SRSLTE_API void srslte_enb_dl_gen_signal(srslte_enb_dl_t *q, 
-                                         cf_t *signal_buffer); 
+SRSLTE_API void srslte_enb_dl_gen_signal(srslte_enb_dl_t *q);
 
 SRSLTE_API int srslte_enb_dl_add_rnti(srslte_enb_dl_t *q, 
                                       uint16_t rnti); 
diff --git a/lib/include/srslte/phy/enb/enb_ul.h b/lib/include/srslte/phy/enb/enb_ul.h
index 855b645ca..0957ccbc0 100644
--- a/lib/include/srslte/phy/enb/enb_ul.h
+++ b/lib/include/srslte/phy/enb/enb_ul.h
@@ -101,6 +101,7 @@ typedef struct {
 
 /* This function shall be called just after the initial synchronization */
 SRSLTE_API int srslte_enb_ul_init(srslte_enb_ul_t *q,
+                                  cf_t *in_buffer,
                                   uint32_t max_prb);
 
 SRSLTE_API void srslte_enb_ul_free(srslte_enb_ul_t *q);
diff --git a/lib/include/srslte/phy/ue/ue_dl.h b/lib/include/srslte/phy/ue/ue_dl.h
index 0b0fc44d2..0486ad7d4 100644
--- a/lib/include/srslte/phy/ue/ue_dl.h
+++ b/lib/include/srslte/phy/ue/ue_dl.h
@@ -80,7 +80,7 @@ typedef struct SRSLTE_API {
   srslte_pmch_t  pmch;
   srslte_phich_t phich; 
   srslte_regs_t regs;
-  srslte_ofdm_t fft;
+  srslte_ofdm_t fft[SRSLTE_MAX_PORTS];
   srslte_ofdm_t fft_mbsfn;
   srslte_chest_dl_t chest;
   
@@ -128,6 +128,7 @@ typedef struct SRSLTE_API {
 
 /* This function shall be called just after the initial synchronization */
 SRSLTE_API int srslte_ue_dl_init(srslte_ue_dl_t *q,
+                                 cf_t *in_buffer[SRSLTE_MAX_PORTS],
                                  uint32_t max_prb,
                                  uint32_t nof_rx_antennas);
 
diff --git a/lib/include/srslte/phy/ue/ue_mib.h b/lib/include/srslte/phy/ue/ue_mib.h
index 5d2ef14fa..ee52b31d4 100644
--- a/lib/include/srslte/phy/ue/ue_mib.h
+++ b/lib/include/srslte/phy/ue/ue_mib.h
@@ -79,6 +79,7 @@ typedef struct SRSLTE_API {
 } srslte_ue_mib_t;
 
 SRSLTE_API int srslte_ue_mib_init(srslte_ue_mib_t *q, 
+                                  cf_t *in_buffer[SRSLTE_MAX_PORTS],
                                   uint32_t max_prb);
 
 SRSLTE_API void srslte_ue_mib_free(srslte_ue_mib_t *q);
@@ -89,7 +90,6 @@ SRSLTE_API int srslte_ue_mib_set_cell(srslte_ue_mib_t * q,
 SRSLTE_API void srslte_ue_mib_reset(srslte_ue_mib_t * q); 
 
 SRSLTE_API int srslte_ue_mib_decode(srslte_ue_mib_t * q, 
-                                    cf_t *input, 
                                     uint8_t bch_payload[SRSLTE_BCH_PAYLOAD_LEN], 
                                     uint32_t *nof_tx_ports, 
                                     int *sfn_offset); 
diff --git a/lib/include/srslte/phy/ue/ue_ul.h b/lib/include/srslte/phy/ue/ue_ul.h
index 3492180e3..617833a61 100644
--- a/lib/include/srslte/phy/ue/ue_ul.h
+++ b/lib/include/srslte/phy/ue/ue_ul.h
@@ -108,6 +108,7 @@ typedef struct SRSLTE_API {
 
 /* This function shall be called just after the initial synchronization */
 SRSLTE_API int srslte_ue_ul_init(srslte_ue_ul_t *q,
+                                 cf_t *out_buffer,
                                  uint32_t max_prb);
 
 SRSLTE_API void srslte_ue_ul_free(srslte_ue_ul_t *q);
diff --git a/lib/src/phy/dft/dft_fftw.c b/lib/src/phy/dft/dft_fftw.c
index b4a627742..a06dd397f 100644
--- a/lib/src/phy/dft/dft_fftw.c
+++ b/lib/src/phy/dft/dft_fftw.c
@@ -93,6 +93,27 @@ static void allocate(srslte_dft_plan_t *plan, int size_in, int size_out, int len
   plan->out = fftwf_malloc(size_out*len);
 }
 
+int srslte_dft_replan_guru_c(srslte_dft_plan_t *plan, const int new_dft_points, cf_t *in_buffer,
+                             cf_t *out_buffer, int istride, int ostride, int how_many,
+                             int idist, int odist) {
+  int sign = (plan->forward) ? FFTW_FORWARD : FFTW_BACKWARD;
+
+  const fftwf_iodim iodim = {new_dft_points, istride, ostride};
+  const fftwf_iodim howmany_dims = {how_many, idist, odist};
+
+  /* Destroy current plan */
+  fftwf_destroy_plan(plan->p);
+
+  plan->p = fftwf_plan_guru_dft(1, &iodim, 1, &howmany_dims, in_buffer, out_buffer, sign, FFTW_TYPE);
+  if (!plan->p) {
+    return -1;
+  }
+  plan->size = new_dft_points;
+  plan->init_size = plan->size;
+
+  return 0;
+}
+
 int srslte_dft_replan_c(srslte_dft_plan_t *plan, const int new_dft_points) {
   int sign = (plan->dir == SRSLTE_DFT_FORWARD) ? FFTW_FORWARD : FFTW_BACKWARD;
   if (plan->p) {
@@ -107,6 +128,32 @@ int srslte_dft_replan_c(srslte_dft_plan_t *plan, const int new_dft_points) {
   return 0;
 }
 
+int srslte_dft_plan_guru_c(srslte_dft_plan_t *plan, const int dft_points, srslte_dft_dir_t dir, cf_t *in_buffer,
+                           cf_t *out_buffer, int istride, int ostride, int how_many,
+                           int idist, int odist) {
+  int sign = (dir == SRSLTE_DFT_FORWARD) ? FFTW_FORWARD : FFTW_BACKWARD;
+
+  const fftwf_iodim iodim = {dft_points, istride, ostride};
+  const fftwf_iodim howmany_dims = {how_many, idist, odist};
+
+  plan->p = fftwf_plan_guru_dft(1, &iodim, 1, &howmany_dims, in_buffer, out_buffer, sign, FFTW_TYPE);
+  if (!plan->p) {
+    return -1;
+  }
+  plan->size = dft_points;
+  plan->init_size = plan->size;
+  plan->mode = SRSLTE_DFT_COMPLEX;
+  plan->dir = dir;
+  plan->forward = (dir==SRSLTE_DFT_FORWARD)?true:false;
+  plan->mirror = false;
+  plan->db = false;
+  plan->norm = false;
+  plan->dc = false;
+  plan->is_guru = true;
+
+  return 0;
+}
+
 int srslte_dft_plan_c(srslte_dft_plan_t *plan, const int dft_points, srslte_dft_dir_t dir) {
   allocate(plan,sizeof(fftwf_complex),sizeof(fftwf_complex), dft_points);
   int sign = (dir == SRSLTE_DFT_FORWARD) ? FFTW_FORWARD : FFTW_BACKWARD;
@@ -123,6 +170,7 @@ int srslte_dft_plan_c(srslte_dft_plan_t *plan, const int dft_points, srslte_dft_
   plan->db = false;
   plan->norm = false;
   plan->dc = false;
+  plan->is_guru = false;
 
   return 0;
 }
@@ -232,6 +280,14 @@ void srslte_dft_run_c(srslte_dft_plan_t *plan, cf_t *in, cf_t *out) {
             plan->forward, plan->mirror, plan->dc);
 }
 
+void srslte_dft_run_guru_c(srslte_dft_plan_t *plan) {
+  if (plan->is_guru == true) {
+    fftwf_execute(plan->p);
+  } else {
+    fprintf(stderr, "srslte_dft_run_guru_c: the selected plan is not guru!\n");
+  }
+}
+
 void srslte_dft_run_r(srslte_dft_plan_t *plan, float *in, float *out) {
   float norm;
   int i;
@@ -255,8 +311,10 @@ void srslte_dft_run_r(srslte_dft_plan_t *plan, float *in, float *out) {
 void srslte_dft_plan_free(srslte_dft_plan_t *plan) {
   if (!plan) return;
   if (!plan->size) return;
-  if (plan->in) fftwf_free(plan->in);
-  if (plan->out) fftwf_free(plan->out);
+  if (!plan->is_guru) {
+    if (plan->in) fftwf_free(plan->in);
+    if (plan->out) fftwf_free(plan->out);
+  }
   if (plan->p) fftwf_destroy_plan(plan->p);
   bzero(plan, sizeof(srslte_dft_plan_t));
 }
diff --git a/lib/src/phy/dft/ofdm.c b/lib/src/phy/dft/ofdm.c
index db5939274..8ea690bb5 100644
--- a/lib/src/phy/dft/ofdm.c
+++ b/lib/src/phy/dft/ofdm.c
@@ -37,23 +37,79 @@
 #include "srslte/phy/utils/debug.h"
 #include "srslte/phy/utils/vector.h"
 
+/* Uncomment next line for avoiding Guru DFT call */
+//#define AVOID_GURU
 
-int srslte_ofdm_init_(srslte_ofdm_t *q, srslte_cp_t cp, int symbol_sz, int nof_prb, srslte_dft_dir_t dir) {
-  return srslte_ofdm_init_mbsfn_(q, cp, symbol_sz, nof_prb, dir, SRSLTE_SF_NORM);
+int srslte_ofdm_init_(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, int symbol_sz, int nof_prb, srslte_dft_dir_t dir) {
+  return srslte_ofdm_init_mbsfn_(q, cp, in_buffer, out_buffer, symbol_sz, nof_prb, dir, SRSLTE_SF_NORM);
 }
 
+int srslte_ofdm_init_mbsfn_(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, int symbol_sz, int nof_prb, srslte_dft_dir_t dir, srslte_sf_t sf_type) {
 
-int srslte_ofdm_init_mbsfn_(srslte_ofdm_t *q, srslte_cp_t cp, int symbol_sz, int nof_prb, srslte_dft_dir_t dir, srslte_sf_t sf_type) {
+  /* Set OFDM object attributes */
+  q->symbol_sz = (uint32_t) symbol_sz;
+  q->nof_symbols = SRSLTE_CP_NSYMB(cp);
+  q->nof_symbols_mbsfn = SRSLTE_CP_NSYMB(SRSLTE_CP_EXT);
+  q->cp = cp;
+  q->freq_shift = false;
+  q->nof_re = (uint32_t) nof_prb * SRSLTE_NRE;
+  q->nof_guards = ((symbol_sz - q->nof_re) / 2);
+  q->slot_sz = (uint32_t) SRSLTE_SLOT_LEN(symbol_sz);
+  q->sf_sz = (uint32_t) SRSLTE_SF_LEN(symbol_sz);
+  q->in_buffer = in_buffer;
+  q->out_buffer= out_buffer;
 
   if (srslte_dft_plan_c(&q->fft_plan, symbol_sz, dir)) {
     fprintf(stderr, "Error: Creating DFT plan\n");
     return -1;
   }
+
+#ifdef AVOID_GURU
   q->tmp = srslte_vec_malloc((uint32_t) symbol_sz * sizeof(cf_t));
   if (!q->tmp) {
     perror("malloc");
     return -1;
   }
+  bzero(q->tmp, sizeof(cf_t) * symbol_sz);
+#else
+  int cp1 = SRSLTE_CP_ISNORM(cp)?SRSLTE_CP_LEN_NORM(0, symbol_sz):SRSLTE_CP_LEN_EXT(symbol_sz);
+  int cp2 = SRSLTE_CP_ISNORM(cp)?SRSLTE_CP_LEN_NORM(1, symbol_sz):SRSLTE_CP_LEN_EXT(symbol_sz);
+
+  q->tmp = srslte_vec_malloc(sizeof(cf_t) * q->sf_sz);
+  if (!q->tmp) {
+    perror("malloc");
+    return -1;
+  }
+  bzero(q->tmp, sizeof(cf_t) * q->sf_sz);
+
+  if (dir == SRSLTE_DFT_BACKWARD) {
+    bzero(in_buffer, sizeof(cf_t) * SRSLTE_SF_LEN_RE(nof_prb, cp));
+  }else {
+    bzero(in_buffer, sizeof(cf_t) * q->sf_sz);
+  }
+
+  for (int slot = 0; slot < 2; slot++) {
+    //bzero(&q->fft_plan_sf[slot], sizeof(srslte_dft_plan_t));
+    //bzero(q->tmp  + SRSLTE_CP_NSYMB(cp)*symbol_sz*slot, sizeof(cf_t) * (cp1 + (SRSLTE_CP_NSYMB(cp) - 1)*cp2 + SRSLTE_CP_NSYMB(cp)*symbol_sz));
+    if (dir == SRSLTE_DFT_FORWARD) {
+      if (srslte_dft_plan_guru_c(&q->fft_plan_sf[slot], symbol_sz, dir,
+                                 in_buffer + cp1 + q->slot_sz * slot,
+                                 q->tmp + q->nof_symbols * q->symbol_sz * slot,
+                                 1, 1, SRSLTE_CP_NSYMB(cp), symbol_sz + cp2, symbol_sz)) {
+        fprintf(stderr, "Error: Creating DFT plan (1)\n");
+        return -1;
+      }
+    } else {
+      if (srslte_dft_plan_guru_c(&q->fft_plan_sf[slot], symbol_sz, dir,
+                                 q->tmp + q->nof_symbols * q->symbol_sz * slot,
+                                 out_buffer + cp1 + q->slot_sz * slot,
+                                 1, 1, SRSLTE_CP_NSYMB(cp), symbol_sz, symbol_sz + cp2)) {
+        fprintf(stderr, "Error: Creating DFT plan (1)\n");
+        return -1;
+      }
+    }
+  }
+#endif
 
   q->shift_buffer = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN(symbol_sz));
   if (!q->shift_buffer) {
@@ -64,15 +120,6 @@ int srslte_ofdm_init_mbsfn_(srslte_ofdm_t *q, srslte_cp_t cp, int symbol_sz, int
   srslte_dft_plan_set_mirror(&q->fft_plan, true);
   srslte_dft_plan_set_dc(&q->fft_plan, true);
 
-  q->symbol_sz = (uint32_t) symbol_sz;
-  q->nof_symbols = SRSLTE_CP_NSYMB(cp);
-  q->nof_symbols_mbsfn = SRSLTE_CP_NSYMB(SRSLTE_CP_EXT);
-  q->cp = cp;
-  q->freq_shift = false;
-  q->nof_re = nof_prb * SRSLTE_NRE;
-  q->nof_guards = ((symbol_sz - q->nof_re) / 2);
-  q->slot_sz = SRSLTE_SLOT_LEN(symbol_sz);
-  
   DEBUG("Init %s symbol_sz=%d, nof_symbols=%d, cp=%s, nof_re=%d, nof_guards=%d\n",
       dir==SRSLTE_DFT_FORWARD?"FFT":"iFFT", q->symbol_sz, q->nof_symbols,
           q->cp==SRSLTE_CP_NORM?"Normal":"Extended", q->nof_re, q->nof_guards);
@@ -101,9 +148,60 @@ int srslte_ofdm_replan_(srslte_ofdm_t *q, srslte_cp_t cp, int symbol_sz, int nof
   q->symbol_sz = (uint32_t) symbol_sz;
   q->nof_symbols = SRSLTE_CP_NSYMB(cp);
   q->cp = cp;
-  q->nof_re = nof_prb * SRSLTE_NRE;
+  q->nof_re = (uint32_t) nof_prb * SRSLTE_NRE;
   q->nof_guards = ((symbol_sz - q->nof_re) / 2);
-  q->slot_sz = SRSLTE_SLOT_LEN(symbol_sz);
+  q->slot_sz = (uint32_t) SRSLTE_SLOT_LEN(symbol_sz);
+  q->sf_sz = (uint32_t) SRSLTE_SF_LEN(symbol_sz);
+
+#ifndef AVOID_GURU
+  cf_t *in_buffer = q->in_buffer;
+  cf_t *out_buffer = q->out_buffer;
+
+  int cp1 = SRSLTE_CP_ISNORM(cp)?SRSLTE_CP_LEN_NORM(0, symbol_sz):SRSLTE_CP_LEN_EXT(symbol_sz);
+  int cp2 = SRSLTE_CP_ISNORM(cp)?SRSLTE_CP_LEN_NORM(1, symbol_sz):SRSLTE_CP_LEN_EXT(symbol_sz);
+
+  srslte_dft_dir_t dir = q->fft_plan_sf[0].dir;
+
+  if (q->tmp) {
+    free(q->tmp);
+  }
+
+  q->tmp = srslte_vec_malloc(sizeof(cf_t) * q->sf_sz);
+  if (!q->tmp) {
+    perror("malloc");
+    return -1;
+  }
+  bzero(q->tmp, sizeof(cf_t) * q->sf_sz);
+
+  if (dir == SRSLTE_DFT_BACKWARD) {
+    bzero(in_buffer, sizeof(cf_t) * SRSLTE_SF_LEN_RE(nof_prb, cp));
+  }else {
+    bzero(in_buffer, sizeof(cf_t) * q->sf_sz);
+  }
+
+  for (int slot = 0; slot < 2; slot++) {
+    srslte_dft_plan_free(&q->fft_plan_sf[slot]);
+
+    if (dir == SRSLTE_DFT_FORWARD) {
+      if (srslte_dft_plan_guru_c(&q->fft_plan_sf[slot], symbol_sz, dir,
+                                 in_buffer + cp1 + q->slot_sz * slot,
+                                 q->tmp + q->nof_symbols * q->symbol_sz * slot,
+                                 1, 1, SRSLTE_CP_NSYMB(cp), symbol_sz + cp2, symbol_sz)) {
+        fprintf(stderr, "Error: Creating DFT plan (1)\n");
+        return -1;
+      }
+    } else {
+      if (srslte_dft_plan_guru_c(&q->fft_plan_sf[slot], symbol_sz, dir,
+                                 q->tmp + q->nof_symbols * q->symbol_sz * slot,
+                                 out_buffer + cp1 + q->slot_sz * slot,
+                                 1, 1, SRSLTE_CP_NSYMB(cp), symbol_sz, symbol_sz + cp2)) {
+        fprintf(stderr, "Error: Creating DFT plan (1)\n");
+        return -1;
+      }
+    }
+  }
+#endif /* AVOID_GURU */
+
 
   if (q->freq_shift) {
     srslte_ofdm_set_freq_shift(q, q->freq_shift_f);
@@ -118,6 +216,15 @@ int srslte_ofdm_replan_(srslte_ofdm_t *q, srslte_cp_t cp, int symbol_sz, int nof
 
 void srslte_ofdm_free_(srslte_ofdm_t *q) {
   srslte_dft_plan_free(&q->fft_plan);
+
+#ifndef AVOID_GURU
+  for (int slot = 0; slot < 2; slot++) {
+    if (q->fft_plan_sf[slot].init_size) {
+      srslte_dft_plan_free(&q->fft_plan_sf[slot]);
+    }
+  }
+#endif
+
   if (q->tmp) {
     free(q->tmp);
   }
@@ -127,28 +234,28 @@ void srslte_ofdm_free_(srslte_ofdm_t *q) {
   bzero(q, sizeof(srslte_ofdm_t));
 }
 
-int srslte_ofdm_rx_init(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t max_prb) {
+int srslte_ofdm_rx_init(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, uint32_t max_prb) {
   int symbol_sz = srslte_symbol_sz(max_prb);
   if (symbol_sz < 0) {
     fprintf(stderr, "Error: Invalid nof_prb=%d\n", max_prb);
     return -1;
   }
   q->max_prb = max_prb;
-  return srslte_ofdm_init_(q, cp, symbol_sz, max_prb, SRSLTE_DFT_FORWARD);
+  return srslte_ofdm_init_(q, cp, in_buffer, out_buffer, symbol_sz, max_prb, SRSLTE_DFT_FORWARD);
 }
 
-int srslte_ofdm_rx_init_mbsfn(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t nof_prb)
+int srslte_ofdm_rx_init_mbsfn(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, uint32_t nof_prb)
 {
   int symbol_sz = srslte_symbol_sz(nof_prb);
   if (symbol_sz < 0) {
     fprintf(stderr, "Error: Invalid nof_prb=%d\n", nof_prb);
     return -1;
   }
-  return srslte_ofdm_init_mbsfn_(q, cp, symbol_sz, nof_prb, SRSLTE_DFT_FORWARD, SRSLTE_SF_MBSFN);
+  return srslte_ofdm_init_mbsfn_(q, cp, in_buffer, out_buffer, symbol_sz, nof_prb, SRSLTE_DFT_FORWARD, SRSLTE_SF_MBSFN);
 }
 
 
-int srslte_ofdm_tx_init(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t max_prb) {
+int srslte_ofdm_tx_init(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, uint32_t max_prb) {
   uint32_t i;
   int ret;
   
@@ -158,7 +265,7 @@ int srslte_ofdm_tx_init(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t max_prb) {
     return -1;
   }
   q->max_prb = max_prb;
-  ret = srslte_ofdm_init_(q, cp, symbol_sz, max_prb, SRSLTE_DFT_BACKWARD); 
+  ret = srslte_ofdm_init_(q, cp, in_buffer, out_buffer, symbol_sz, max_prb, SRSLTE_DFT_BACKWARD);
 
   
   if (ret == SRSLTE_SUCCESS) {
@@ -173,7 +280,7 @@ int srslte_ofdm_tx_init(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t max_prb) {
   return ret;
 }
 
-int srslte_ofdm_tx_init_mbsfn(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t nof_prb)
+int srslte_ofdm_tx_init_mbsfn(srslte_ofdm_t *q, srslte_cp_t cp, cf_t *in_buffer, cf_t *out_buffer, uint32_t nof_prb)
 {
   uint32_t i;
   int ret;
@@ -184,7 +291,7 @@ int srslte_ofdm_tx_init_mbsfn(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t nof_prb
     return -1;
   }
 
-  ret = srslte_ofdm_init_mbsfn_(q, cp, symbol_sz, nof_prb, SRSLTE_DFT_BACKWARD, SRSLTE_SF_MBSFN);
+  ret = srslte_ofdm_init_mbsfn_(q, cp, in_buffer, out_buffer, symbol_sz, nof_prb, SRSLTE_DFT_BACKWARD, SRSLTE_SF_MBSFN);
   
   if (ret == SRSLTE_SUCCESS) {
     srslte_dft_plan_set_norm(&q->fft_plan, false);
@@ -207,7 +314,8 @@ int srslte_ofdm_rx_set_prb(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t nof_prb) {
     }
     return srslte_ofdm_replan_(q, cp, symbol_sz, nof_prb);
   } else {
-    fprintf(stderr, "OFDM: Error calling set_prb: nof_prb must be equal or lower initialized max_prb\n");
+    fprintf(stderr, "OFDM (Rx): Error calling set_prb: nof_prb (%d) must be equal or lower initialized max_prb (%d)\n",
+            nof_prb, q->max_prb);
     return -1;
   }
 }
@@ -234,7 +342,8 @@ int srslte_ofdm_tx_set_prb(srslte_ofdm_t *q, srslte_cp_t cp, uint32_t nof_prb) {
     }
     return ret;
   } else {
-    fprintf(stderr, "OFDM: Error calling set_prb: nof_prb must be equal or lower initialized max_prb\n");
+    fprintf(stderr, "OFDM (Tx): Error calling set_prb: nof_prb (%d) must be equal or lower initialized max_prb (%d)\n",
+            nof_prb, q->max_prb);
     return -1;
   }
 }
@@ -274,8 +383,12 @@ void srslte_ofdm_tx_free(srslte_ofdm_t *q) {
 /* Transforms input samples into output OFDM symbols.
  * Performs FFT on a each symbol and removes CP.
  */
-void srslte_ofdm_rx_slot(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
+void srslte_ofdm_rx_slot(srslte_ofdm_t *q, int slot_in_sf) {
+  cf_t *output = q->out_buffer + slot_in_sf * q->nof_re * q->nof_symbols;
+
+#ifdef AVOID_GURU
   uint32_t i;
+  cf_t *input = q->in_buffer + slot_in_sf * q->slot_sz;
   for (i=0;i<q->nof_symbols;i++) {
     input += SRSLTE_CP_ISNORM(q->cp)?SRSLTE_CP_LEN_NORM(i, q->symbol_sz):SRSLTE_CP_LEN_EXT(q->symbol_sz);
     srslte_dft_run_c(&q->fft_plan, input, q->tmp);
@@ -283,6 +396,25 @@ void srslte_ofdm_rx_slot(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
     input += q->symbol_sz;
     output += q->nof_re;
   }
+#else
+  float norm = 1.0f/sqrtf(q->fft_plan.size);
+  cf_t *tmp = q->tmp + slot_in_sf * q->symbol_sz * q->nof_symbols;
+  uint32_t dc = (q->fft_plan.dc) ? 1:0;
+
+  srslte_dft_run_guru_c(&q->fft_plan_sf[slot_in_sf]);
+
+  for (int i = 0; i < q->nof_symbols; i++) {
+    memcpy(output, tmp + q->symbol_sz - q->nof_re / 2, sizeof(cf_t) * q->nof_re / 2);
+    memcpy(output + q->nof_re / 2, &tmp[dc], sizeof(cf_t) * q->nof_re / 2);
+
+    if (q->fft_plan.norm) {
+      srslte_vec_sc_prod_cfc(output, norm, output, q->nof_re);
+    }
+
+    tmp += q->symbol_sz;
+    output += q->nof_re;
+  }
+#endif
 }
 
 void srslte_ofdm_rx_slot_mbsfn(srslte_ofdm_t *q, cf_t *input, cf_t *output)
@@ -314,29 +446,32 @@ void srslte_ofdm_rx_slot_zerocopy(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
   }  
 }
 
-void srslte_ofdm_rx_sf(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
-  uint32_t n; 
+void srslte_ofdm_rx_sf(srslte_ofdm_t *q) {
+  uint32_t n;
   if (q->freq_shift) {
-    srslte_vec_prod_ccc(input, q->shift_buffer, input, 2*q->slot_sz);
+    srslte_vec_prod_ccc(q->in_buffer, q->shift_buffer, q->in_buffer, 2*q->slot_sz);
   }
   if(!q->mbsfn_subframe){
     for (n=0;n<2;n++) {
-      srslte_ofdm_rx_slot(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]);
+      srslte_ofdm_rx_slot(q, n);
     }
   }
   else{
-    srslte_ofdm_rx_slot_mbsfn(q, &input[0*q->slot_sz], &output[0*q->nof_re*q->nof_symbols]);
-    srslte_ofdm_rx_slot(q, &input[1*q->slot_sz], &output[1*q->nof_re*q->nof_symbols]);
+    srslte_ofdm_rx_slot_mbsfn(q, &q->in_buffer[0*q->slot_sz], &q->out_buffer[0*q->nof_re*q->nof_symbols]);
+    srslte_ofdm_rx_slot(q, 1);
   }
 }
 
 /* Transforms input OFDM symbols into output samples.
  * Performs FFT on a each symbol and adds CP.
  */
-void srslte_ofdm_tx_slot(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
-  uint32_t i, cp_len;
-  for (i=0;i<q->nof_symbols;i++) {
-    cp_len = SRSLTE_CP_ISNORM(q->cp)?SRSLTE_CP_LEN_NORM(i, q->symbol_sz):SRSLTE_CP_LEN_EXT(q->symbol_sz);
+void srslte_ofdm_tx_slot(srslte_ofdm_t *q, int slot_in_sf) {
+  cf_t *input = q->in_buffer + slot_in_sf * q->nof_re * q->nof_symbols;
+  cf_t *output = q->out_buffer + slot_in_sf * q->slot_sz;
+
+#ifdef AVOID_GURU
+  for (int i=0;i<q->nof_symbols;i++) {
+    int cp_len = SRSLTE_CP_ISNORM(q->cp)?SRSLTE_CP_LEN_NORM(i, q->symbol_sz):SRSLTE_CP_LEN_EXT(q->symbol_sz);
     memcpy(&q->tmp[q->nof_guards], input, q->nof_re * sizeof(cf_t));
     srslte_dft_run_c(&q->fft_plan, q->tmp, &output[cp_len]);
     input += q->nof_re;
@@ -344,6 +479,60 @@ void srslte_ofdm_tx_slot(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
     memcpy(output, &output[q->symbol_sz], cp_len * sizeof(cf_t));
     output += q->symbol_sz + cp_len;
   }
+#else
+  float norm = 1.0f/sqrtf(q->symbol_sz);
+  cf_t *tmp = q->tmp + slot_in_sf * q->symbol_sz * q->nof_symbols;
+  /* bzero() takes a byte count: clear all nof_symbols staging symbols of this slot, not slot_sz bytes */
+  bzero(tmp, sizeof(cf_t) * q->symbol_sz * q->nof_symbols);
+  uint32_t dc = (q->fft_plan.dc) ? 1:0;
+
+  for (int i = 0; i < q->nof_symbols; i++) {
+    memcpy(&tmp[dc], &input[q->nof_re / 2], q->nof_re / 2 * sizeof(cf_t));
+    memcpy(&tmp[q->symbol_sz - q->nof_re / 2], &input[0], q->nof_re / 2 * sizeof(cf_t));
+
+    input += q->nof_re;
+    tmp += q->symbol_sz;
+  }
+
+  srslte_dft_run_guru_c(&q->fft_plan_sf[slot_in_sf]);
+
+  for (int i=0;i<q->nof_symbols;i++) {
+    int cp_len = SRSLTE_CP_ISNORM(q->cp) ? SRSLTE_CP_LEN_NORM(i, q->symbol_sz) : SRSLTE_CP_LEN_EXT(q->symbol_sz);
+
+    if (q->fft_plan.norm) {
+      srslte_vec_sc_prod_cfc(&output[cp_len], norm, &output[cp_len], q->symbol_sz);
+    }
+
+    /* add CP */
+    memcpy(output, &output[q->symbol_sz], cp_len * sizeof(cf_t));
+    output += q->symbol_sz + cp_len;
+  }
+#endif
+
+  /*input = q->in_buffer + slot_in_sf * q->nof_re * q->nof_symbols;
+  cf_t *output2 = srslte_vec_malloc(sizeof(cf_t) * q->slot_sz);
+  cf_t *o2 = output2;
+  bzero(q->tmp, sizeof(cf_t)*q->symbol_sz);
+  //bzero(output2, sizeof(cf_t)*q->slot_sz);
+  for (int i=0;i<q->nof_symbols;i++) {
+    int cp_len = SRSLTE_CP_ISNORM(q->cp)?SRSLTE_CP_LEN_NORM(i, q->symbol_sz):SRSLTE_CP_LEN_EXT(q->symbol_sz);
+    memcpy(&q->tmp[q->nof_guards], input, q->nof_re * sizeof(cf_t));
+    srslte_dft_run_c(&q->fft_plan, q->tmp, &o2[cp_len]);
+    input += q->nof_re;
+    memcpy(o2, &o2[q->symbol_sz], cp_len * sizeof(cf_t));
+    o2 += q->symbol_sz + cp_len;
+  }
+  cf_t *output1 = q->out_buffer + slot_in_sf * q->slot_sz;//srslte_vec_malloc(sizeof(cf_t) * q->slot_sz);
+
+  for (int i = 0; i < q->slot_sz; i++) {
+    float error = cabsf(output1[i] - output2[i])/cabsf(output2[i]);
+    cf_t k = output1[i]/output2[i];
+    if (error > 0.1) printf("%d/%05d error=%f output=%+f%+fi gold=%+f%+fi k=%+f%+fi\n", slot_in_sf, i, error,
+                            __real__ output1[i], __imag__ output1[i],
+                            __real__ output2[i], __imag__ output2[i],
+                            __real__ k, __imag__ k);
+  }
+  free(output2);/**/
 }
 
 void srslte_ofdm_tx_slot_mbsfn(srslte_ofdm_t *q, cf_t *input, cf_t *output)
@@ -369,20 +558,20 @@ void srslte_ofdm_set_normalize(srslte_ofdm_t *q, bool normalize_enable) {
   srslte_dft_plan_set_norm(&q->fft_plan, normalize_enable);
 }
 
-void srslte_ofdm_tx_sf(srslte_ofdm_t *q, cf_t *input, cf_t *output)
+void srslte_ofdm_tx_sf(srslte_ofdm_t *q)
 {
-  uint32_t n; 
+  uint32_t n;
   if(!q->mbsfn_subframe){
     for (n=0;n<2;n++) {
-      srslte_ofdm_tx_slot(q, &input[n*q->nof_re*q->nof_symbols], &output[n*q->slot_sz]);
+      srslte_ofdm_tx_slot(q, n);
     }
   }
   else{
-     srslte_ofdm_tx_slot_mbsfn(q, &input[0*q->nof_re*q->nof_symbols], &output[0*q->slot_sz]);
-     srslte_ofdm_tx_slot(q, &input[1*q->nof_re*q->nof_symbols], &output[1*q->slot_sz]);
+     srslte_ofdm_tx_slot_mbsfn(q, &q->in_buffer[0*q->nof_re*q->nof_symbols], &q->out_buffer[0*q->slot_sz]);
+     srslte_ofdm_tx_slot(q, 1);
   }
   if (q->freq_shift) {
-    srslte_vec_prod_ccc(output, q->shift_buffer, output, 2*q->slot_sz);
+    srslte_vec_prod_ccc(q->out_buffer, q->shift_buffer, q->out_buffer, 2*q->slot_sz);
   }
 }
 
diff --git a/lib/src/phy/dft/test/ofdm_test.c b/lib/src/phy/dft/test/ofdm_test.c
index 11aac7f4e..e77fcd39e 100644
--- a/lib/src/phy/dft/test/ofdm_test.c
+++ b/lib/src/phy/dft/test/ofdm_test.c
@@ -35,16 +35,28 @@
 
 int nof_prb = -1;
 srslte_cp_t cp = SRSLTE_CP_NORM;
+int nof_repetitions = 128;
+
+/* Returns the time from *ts_start to *ts_end, expressed in microseconds. */
+static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
+  double whole_sec = (double) ts_end->tv_sec - (double) ts_start->tv_sec;
+  double frac_usec = (double) ts_end->tv_usec - (double) ts_start->tv_usec;
+
+  /* A negative microsecond difference is simply a borrow from the seconds
+   * difference, so the plain sum covers both cases without branching. */
+  return whole_sec * 1000000 + frac_usec;
+}
 
 void usage(char *prog) {
   printf("Usage: %s\n", prog);
   printf("\t-n nof_prb [Default All]\n");
   printf("\t-e extended cyclic prefix [Default Normal]\n");
+  printf("\t-r nof_repetitions [Default %d]\n", nof_repetitions);
 }
 
 void parse_args(int argc, char **argv) {
   int opt;
-  while ((opt = getopt(argc, argv, "ne")) != -1) {
+  while ((opt = getopt(argc, argv, "ner")) != -1) {
     switch (opt) {
     case 'n':
       nof_prb = atoi(argv[optind]);
@@ -52,6 +64,9 @@ void parse_args(int argc, char **argv) {
     case 'e':
       cp = SRSLTE_CP_EXT;
       break;
+    case 'r':
+      nof_repetitions = atoi(argv[optind]);
+      break;
     default:
       usage(argv[0]);
       exit(-1);
@@ -61,6 +76,7 @@ void parse_args(int argc, char **argv) {
 
 
 int main(int argc, char **argv) {
+  struct timeval start, end;
   srslte_ofdm_t fft, ifft;
   cf_t *input, *outfft, *outifft;
   float mse;
@@ -81,48 +97,65 @@ int main(int argc, char **argv) {
 
     printf("Running test for %d PRB, %d RE... ", n_prb, n_re);fflush(stdout);
 
-    input = malloc(sizeof(cf_t) * n_re);
+    input = srslte_vec_malloc(sizeof(cf_t) * n_re * 2);
     if (!input) {
       perror("malloc");
       exit(-1);
     }
-    outfft = malloc(sizeof(cf_t) * SRSLTE_SLOT_LEN(srslte_symbol_sz(n_prb)));
+    outfft = srslte_vec_malloc(sizeof(cf_t) * n_re * 2);
     if (!outfft) {
       perror("malloc");
       exit(-1);
     }
-    outifft = malloc(sizeof(cf_t) * n_re);
+    outifft = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SLOT_LEN(srslte_symbol_sz(n_prb)) * 2);
     if (!outifft) {
       perror("malloc");
       exit(-1);
     }
+    bzero(outifft, sizeof(cf_t) * SRSLTE_SLOT_LEN(srslte_symbol_sz(n_prb)) * 2);
 
-    if (srslte_ofdm_rx_init(&fft, cp, n_prb)) {
+    if (srslte_ofdm_rx_init(&fft, cp, outifft, outfft, n_prb)) {
       fprintf(stderr, "Error initializing FFT\n");
       exit(-1);
     }
-    srslte_dft_plan_set_norm(&fft.fft_plan, true);
+    srslte_ofdm_set_normalize(&fft, true);
 
-    if (srslte_ofdm_tx_init(&ifft, cp, n_prb)) {
+    if (srslte_ofdm_tx_init(&ifft, cp, input, outifft, n_prb)) {
       fprintf(stderr, "Error initializing iFFT\n");
       exit(-1);
     }
-    srslte_dft_plan_set_norm(&ifft.fft_plan, true);
+    srslte_ofdm_set_normalize(&ifft, true);
 
     for (i=0;i<n_re;i++) {
-      input[i] = 100 * ((float) rand()/RAND_MAX + (float) I*rand()/RAND_MAX);
+      input[i] = 100 * ((float) rand() / (float) RAND_MAX + I * ((float) rand() / (float) RAND_MAX));
+      //input[i] = 100;
     }
 
-    srslte_ofdm_tx_slot(&ifft, input, outfft);
-    srslte_ofdm_rx_slot(&fft, outfft, outifft);
+    gettimeofday(&start, NULL); /* time nof_repetitions runs of the Tx slot processing */
+    for (int i = 0; i < nof_repetitions; i++) {
+      srslte_ofdm_tx_slot(&ifft, 0);
+    }
+    gettimeofday(&end, NULL);
+    printf(" Tx@%.1fMsps", (float)(SRSLTE_SLOT_LEN(srslte_symbol_sz(n_prb))*nof_repetitions)/elapsed_us(&start, &end));
+
+    gettimeofday(&start, NULL); /* same measurement for the Rx slot processing */
+    for (int i = 0; i < nof_repetitions; i++) {
+      srslte_ofdm_rx_slot(&fft, 0);
+    }
+    gettimeofday(&end, NULL);
+    printf(" Rx@%.1fMsps", (float)(SRSLTE_SLOT_LEN(srslte_symbol_sz(n_prb))*nof_repetitions)/elapsed_us(&start, &end));
 
     /* compute MSE */
-
-    mse = 0;
+    mse = 0.0f;
     for (i=0;i<n_re;i++) {
-      mse += cabsf(input[i] - outifft[i]);
+      cf_t error = input[i] - outfft[i];
+      mse += (__real__ error * __real__ error + __imag__ error * __imag__ error)/cabsf(input[i]);
+      if (mse > 1.0f) printf("%04d. %+.1f%+.1fi Vs. %+.1f%+.1f %+.1f%+.1f (mse=%f)\n", i, __real__ input[i], __imag__ input[i], __real__ outifft[i], __imag__ outifft[i], __real__ outfft[i], __imag__ outfft[i], mse);
     }
-    printf("MSE=%f\n", mse);
+    /*for (i=0;i<n_re;i++) {
+      mse += cabsf(input[i] - outfft[i]);
+    }*/
+    printf(" MSE=%.6f\n", mse);
 
     if (mse >= 0.07) {
       printf("MSE too large\n");
diff --git a/lib/src/phy/enb/enb_dl.c b/lib/src/phy/enb/enb_dl.c
index 2ba179399..54a63bc2a 100644
--- a/lib/src/phy/enb/enb_dl.c
+++ b/lib/src/phy/enb/enb_dl.c
@@ -41,7 +41,7 @@
 
 #define SRSLTE_ENB_RF_AMP 0.1
 
-int srslte_enb_dl_init(srslte_enb_dl_t *q, uint32_t max_prb)
+int srslte_enb_dl_init(srslte_enb_dl_t *q, cf_t *out_buffer[SRSLTE_MAX_PORTS], uint32_t max_prb)
 {
   int ret = SRSLTE_ERROR_INVALID_INPUTS; 
   
@@ -53,13 +53,26 @@ int srslte_enb_dl_init(srslte_enb_dl_t *q, uint32_t max_prb)
     
     q->cfi  = 3;
     q->tx_amp = SRSLTE_ENB_RF_AMP;
-    
-    if (srslte_ofdm_tx_init(&q->ifft, SRSLTE_CP_NORM, max_prb)) {
-      fprintf(stderr, "Error initiating FFT\n");
-      goto clean_exit;
+
+    for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
+      q->sf_symbols[i] = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
+      if (!q->sf_symbols[i]) {
+        perror("malloc");
+        goto clean_exit;
+      }
+      q->slot1_symbols[i] = &q->sf_symbols[i][SRSLTE_SLOT_LEN_RE(max_prb, SRSLTE_CP_NORM)];
     }
 
-    srslte_ofdm_set_normalize(&q->ifft, true);
+    for (int i = 0; i < SRSLTE_MAX_PORTS; i++) {
+      if (srslte_ofdm_tx_init(&q->ifft[i], SRSLTE_CP_NORM, q->sf_symbols[i], out_buffer[i], max_prb)) {
+        fprintf(stderr, "Error initiating FFT (%d)\n", i);
+        goto clean_exit;
+      }
+    }
+
+    for (int i = 0; i < SRSLTE_MAX_PORTS; i++) { /* all iFFTs initialised above; init() gets no cell, so q->cell.nof_ports is presumably unset here */
+      srslte_ofdm_set_normalize(&q->ifft[i], true);
+    }
 
     if (srslte_pbch_init(&q->pbch)) {
       fprintf(stderr, "Error creating PBCH object\n");
@@ -89,15 +102,6 @@ int srslte_enb_dl_init(srslte_enb_dl_t *q, uint32_t max_prb)
       goto clean_exit;
     }
     
-    for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
-      q->sf_symbols[i] = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
-      if (!q->sf_symbols[i]) {
-        perror("malloc");
-        goto clean_exit; 
-      }
-      q->slot1_symbols[i] = &q->sf_symbols[i][SRSLTE_SLOT_LEN_RE(max_prb, SRSLTE_CP_NORM)];
-    }
-    
     ret = SRSLTE_SUCCESS;
     
   } else {
@@ -114,7 +118,9 @@ clean_exit:
 void srslte_enb_dl_free(srslte_enb_dl_t *q)
 {
   if (q) {
-    srslte_ofdm_tx_free(&q->ifft);
+    for (int i = 0; i < SRSLTE_MAX_PORTS; i++) {
+      srslte_ofdm_tx_free(&q->ifft[i]);
+    }
     srslte_regs_free(&q->regs);
     srslte_pbch_free(&q->pbch);
     srslte_pcfich_free(&q->pcfich);
@@ -152,9 +158,11 @@ int srslte_enb_dl_set_cell(srslte_enb_dl_t *q, srslte_cell_t cell)
         fprintf(stderr, "Error resizing REGs\n");
         return SRSLTE_ERROR;
       }
-      if (srslte_ofdm_rx_set_prb(&q->ifft, q->cell.cp, q->cell.nof_prb)) {
-        fprintf(stderr, "Error initiating FFT\n");
-        return SRSLTE_ERROR;
+      for (int i = 0; i < q->cell.nof_ports; i++) {
+        if (srslte_ofdm_tx_set_prb(&q->ifft[i], q->cell.cp, q->cell.nof_prb)) {
+          fprintf(stderr, "Error re-planning iFFT (%d)\n", i);
+          return SRSLTE_ERROR;
+        }
       }
       if (srslte_pbch_set_cell(&q->pbch, q->cell)) {
         fprintf(stderr, "Error creating PBCH object\n");
@@ -264,14 +272,15 @@ void srslte_enb_dl_put_base(srslte_enb_dl_t *q, uint32_t tti)
   
 }
 
-void srslte_enb_dl_gen_signal(srslte_enb_dl_t *q, cf_t *signal_buffer) 
+void srslte_enb_dl_gen_signal(srslte_enb_dl_t *q)
 {
-  
-  srslte_ofdm_tx_sf(&q->ifft, q->sf_symbols[0], signal_buffer);
-     
   // TODO: PAPR control
   float norm_factor = (float) sqrt(q->cell.nof_prb)/15;
-  srslte_vec_sc_prod_cfc(signal_buffer, q->tx_amp*norm_factor, signal_buffer, SRSLTE_SF_LEN_PRB(q->cell.nof_prb));
+
+  for (int i = 0; i < q->cell.nof_ports; i++) {
+    srslte_ofdm_tx_sf(&q->ifft[i]);
+    srslte_vec_sc_prod_cfc(q->ifft[i].out_buffer, q->tx_amp*norm_factor, q->ifft[i].out_buffer, (uint32_t) SRSLTE_SF_LEN_PRB(q->cell.nof_prb));
+  }
 }
 
 int srslte_enb_dl_add_rnti(srslte_enb_dl_t *q, uint16_t rnti)
diff --git a/lib/src/phy/enb/enb_ul.c b/lib/src/phy/enb/enb_ul.c
index db05d44ea..f94eb0277 100644
--- a/lib/src/phy/enb/enb_ul.c
+++ b/lib/src/phy/enb/enb_ul.c
@@ -40,6 +40,7 @@
 #define MAX_CANDIDATES  16
 
 int srslte_enb_ul_init(srslte_enb_ul_t *q,
+                       cf_t *in_buffer,
                        uint32_t max_prb)
 {
   int ret = SRSLTE_ERROR_INVALID_INPUTS; 
@@ -55,8 +56,20 @@ int srslte_enb_ul_init(srslte_enb_ul_t *q,
       perror("malloc");
       goto clean_exit;
     }
-    
-    if (srslte_ofdm_rx_init(&q->fft, SRSLTE_CP_NORM, max_prb)) {
+
+    q->sf_symbols = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
+    if (!q->sf_symbols) {
+      perror("malloc");
+      goto clean_exit;
+    }
+
+    q->ce = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
+    if (!q->ce) {
+      perror("malloc");
+      goto clean_exit;
+    }
+
+    if (srslte_ofdm_rx_init(&q->fft, SRSLTE_CP_NORM, in_buffer, q->sf_symbols, max_prb)) {
       fprintf(stderr, "Error initiating FFT\n");
       goto clean_exit;
     }
@@ -80,18 +93,6 @@ int srslte_enb_ul_init(srslte_enb_ul_t *q,
       goto clean_exit; 
     }
 
-    q->sf_symbols = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
-    if (!q->sf_symbols) {
-      perror("malloc");
-      goto clean_exit; 
-    }
-    
-    q->ce = srslte_vec_malloc(SRSLTE_SF_LEN_RE(max_prb, SRSLTE_CP_NORM) * sizeof(cf_t));
-    if (!q->ce) {
-      perror("malloc");
-      goto clean_exit; 
-    }
-        
     ret = SRSLTE_SUCCESS;
     
   } else {
@@ -254,7 +255,7 @@ int srslte_enb_ul_cfg_ue(srslte_enb_ul_t *q, uint16_t rnti,
 
 void srslte_enb_ul_fft(srslte_enb_ul_t *q, cf_t *signal_buffer) 
 {
-  srslte_ofdm_rx_sf(&q->fft, signal_buffer, q->sf_symbols);
+  srslte_ofdm_rx_sf(&q->fft);
 }
 
 int get_pucch(srslte_enb_ul_t *q, uint16_t rnti, 
diff --git a/lib/src/phy/phch/test/pbch_file_test.c b/lib/src/phy/phch/test/pbch_file_test.c
index 734640d55..2ca12e4c9 100644
--- a/lib/src/phy/phch/test/pbch_file_test.c
+++ b/lib/src/phy/phch/test/pbch_file_test.c
@@ -140,7 +140,7 @@ int base_init() {
     return -1;
   }
 
-  if (srslte_ofdm_init_(&fft, cell.cp, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
+  if (srslte_ofdm_init_(&fft, cell.cp, input_buffer, fft_buffer, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
     fprintf(stderr, "Error initializing FFT\n");
     return -1;
   }
@@ -203,7 +203,7 @@ int main(int argc, char **argv) {
 
     if (nread > 0) {
       // process 1st subframe only
-      srslte_ofdm_rx_sf(&fft, input_buffer, fft_buffer);
+      srslte_ofdm_rx_sf(&fft);
 
       /* Get channel estimates for each port */
       srslte_chest_dl_estimate(&chest, fft_buffer, ce, 0);
diff --git a/lib/src/phy/phch/test/pcfich_file_test.c b/lib/src/phy/phch/test/pcfich_file_test.c
index dfb8d72e3..e92d6c7ba 100644
--- a/lib/src/phy/phch/test/pcfich_file_test.c
+++ b/lib/src/phy/phch/test/pcfich_file_test.c
@@ -120,15 +120,15 @@ int base_init() {
     fmatlab = NULL;
   }
 
-  flen = SRSLTE_SF_LEN(srslte_symbol_sz(cell.nof_prb));
+  flen = SRSLTE_SF_LEN(srslte_symbol_sz_power2(cell.nof_prb));
 
-  input_buffer = malloc(flen * sizeof(cf_t));
+  input_buffer = srslte_vec_malloc(flen * sizeof(cf_t));
   if (!input_buffer) {
     perror("malloc");
     exit(-1);
   }
 
-  fft_buffer = malloc(SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp) * sizeof(cf_t));
+  fft_buffer = srslte_vec_malloc(SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp) * sizeof(cf_t));
   if (!fft_buffer) {
     perror("malloc");
     return -1;
@@ -151,7 +151,7 @@ int base_init() {
     return -1;
   }
 
-  if (srslte_ofdm_init_(&fft, cell.cp, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
+  if (srslte_ofdm_init_(&fft, cell.cp, input_buffer, fft_buffer, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
     fprintf(stderr, "Error initializing FFT\n");
     return -1;
   }
@@ -215,7 +215,7 @@ int main(int argc, char **argv) {
 
   n = srslte_filesource_read(&fsrc, input_buffer, flen);
 
-  srslte_ofdm_rx_sf(&fft, input_buffer, fft_buffer);
+  srslte_ofdm_rx_sf(&fft);
 
   if (fmatlab) {
     fprintf(fmatlab, "infft=");
diff --git a/lib/src/phy/phch/test/pdcch_file_test.c b/lib/src/phy/phch/test/pdcch_file_test.c
index d4ceed4b6..5482d9f98 100644
--- a/lib/src/phy/phch/test/pdcch_file_test.c
+++ b/lib/src/phy/phch/test/pdcch_file_test.c
@@ -126,7 +126,7 @@ int base_init() {
     exit(-1);
   }
 
-  flen = 2 * (SRSLTE_SLOT_LEN(srslte_symbol_sz(cell.nof_prb)));
+  flen = 2 * (SRSLTE_SLOT_LEN(srslte_symbol_sz_power2(cell.nof_prb)));
 
   input_buffer = malloc(flen * sizeof(cf_t));
   if (!input_buffer) {
@@ -157,7 +157,7 @@ int base_init() {
     return -1;
   }
 
-  if (srslte_ofdm_init_(&fft, cell.cp, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
+  if (srslte_ofdm_init_(&fft, cell.cp, input_buffer, fft_buffer, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
     fprintf(stderr, "Error initializing FFT\n");
     return -1;
   }
@@ -231,7 +231,7 @@ int main(int argc, char **argv) {
 
     INFO("Reading %d samples sub-frame %d\n", flen, frame_cnt);
 
-    srslte_ofdm_rx_sf(&fft, input_buffer, fft_buffer);
+    srslte_ofdm_rx_sf(&fft);
 
     /* Get channel estimates for each port */
     srslte_chest_dl_estimate(&chest, fft_buffer, ce, frame_cnt %10);
diff --git a/lib/src/phy/phch/test/pdsch_pdcch_file_test.c b/lib/src/phy/phch/test/pdsch_pdcch_file_test.c
index 90c0e1c17..0faf7eca1 100644
--- a/lib/src/phy/phch/test/pdsch_pdcch_file_test.c
+++ b/lib/src/phy/phch/test/pdsch_pdcch_file_test.c
@@ -129,7 +129,7 @@ int base_init() {
     exit(-1);
   }
 
-  flen = 2 * (SRSLTE_SLOT_LEN(srslte_symbol_sz(cell.nof_prb)));
+  flen = SRSLTE_SF_LEN(srslte_symbol_sz_power2(cell.nof_prb));
 
   input_buffer[0] = malloc(flen * sizeof(cf_t));
   if (!input_buffer[0]) {
@@ -137,7 +137,7 @@ int base_init() {
     exit(-1);
   }
 
-  if (srslte_ue_dl_init(&ue_dl, cell.nof_prb, 1)) {
+  if (srslte_ue_dl_init(&ue_dl, input_buffer, cell.nof_prb, 1)) {
     fprintf(stderr, "Error initializing UE DL\n");
     return -1;
   }
diff --git a/lib/src/phy/phch/test/phich_file_test.c b/lib/src/phy/phch/test/phich_file_test.c
index d7078f933..65f7ce9c0 100644
--- a/lib/src/phy/phch/test/phich_file_test.c
+++ b/lib/src/phy/phch/test/phich_file_test.c
@@ -144,7 +144,7 @@ int base_init() {
     fmatlab = NULL;
   }
 
-  flen = SRSLTE_SF_LEN(srslte_symbol_sz(cell.nof_prb));
+  flen = SRSLTE_SF_LEN(srslte_symbol_sz_power2(cell.nof_prb));
 
   input_buffer = malloc(flen * sizeof(cf_t));
   if (!input_buffer) {
@@ -175,7 +175,7 @@ int base_init() {
     return -1;
   }
 
-  if (srslte_ofdm_init_(&fft, cell.cp, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
+  if (srslte_ofdm_init_(&fft, cell.cp, input_buffer, fft_buffer, srslte_symbol_sz_power2(cell.nof_prb), cell.nof_prb, SRSLTE_DFT_FORWARD)) {
     fprintf(stderr, "Error initializing FFT\n");
     return -1;
   }
@@ -242,7 +242,7 @@ int main(int argc, char **argv) {
 
   n = srslte_filesource_read(&fsrc, input_buffer, flen);
 
-  srslte_ofdm_rx_sf(&fft, input_buffer, fft_buffer);
+  srslte_ofdm_rx_sf(&fft);
 
   if (fmatlab) {
     fprintf(fmatlab, "infft=");
diff --git a/lib/src/phy/phch/test/pmch_file_test.c b/lib/src/phy/phch/test/pmch_file_test.c
index ac66072fa..6586b2ee9 100644
--- a/lib/src/phy/phch/test/pmch_file_test.c
+++ b/lib/src/phy/phch/test/pmch_file_test.c
@@ -140,7 +140,7 @@ int base_init() {
     exit(-1);
   }
 
-  if (srslte_ue_dl_init(&ue_dl, cell.nof_prb, 1)) {
+  if (srslte_ue_dl_init(&ue_dl, input_buffer, cell.nof_prb, 1)) {
     fprintf(stderr, "Error initializing UE DL\n");
     return -1;
   }
diff --git a/lib/src/phy/phch/test/pmch_test.c b/lib/src/phy/phch/test/pmch_test.c
index a187ca2bb..a9c29ef64 100644
--- a/lib/src/phy/phch/test/pmch_test.c
+++ b/lib/src/phy/phch/test/pmch_test.c
@@ -139,7 +139,7 @@ cf_t *tx_slot_symbols[SRSLTE_MAX_PORTS];
 cf_t *rx_slot_symbols[SRSLTE_MAX_PORTS];
 srslte_pmch_t pmch_tx, pmch_rx;
 srslte_pdsch_cfg_t  pmch_cfg;
-srslte_ofdm_t ifft_mbsfn, fft_mbsfn; 
+srslte_ofdm_t ifft_mbsfn[SRSLTE_MAX_PORTS], fft_mbsfn[SRSLTE_MAX_PORTS];
 
 int main(int argc, char **argv) {
   uint32_t i, j, k;
@@ -169,10 +169,10 @@ int main(int argc, char **argv) {
     grant.tb_en[1] = false;
     grant.nof_tb = 1;
     grant.mcs[0].idx = mcs_idx;
-   
+
     grant.nof_prb = cell.nof_prb;
     grant.sf_type = SRSLTE_SF_MBSFN;
-    
+
     srslte_dl_fill_ra_mcs(&grant.mcs[0], cell.nof_prb);
     grant.Qm[0] = srslte_mod_bits_x_symbol(grant.mcs[0].mod);
     for(int i = 0; i < 2; i++){
@@ -181,41 +181,6 @@ int main(int argc, char **argv) {
       }
     }
 
-  
-
-#ifdef DO_OFDM
-
- if (srslte_ofdm_tx_init_mbsfn(&ifft_mbsfn, SRSLTE_CP_EXT, cell.nof_prb)) {
-    fprintf(stderr, "Error creating iFFT object\n");
-    exit(-1);
-  }
-  if (srslte_ofdm_rx_init_mbsfn(&fft_mbsfn, SRSLTE_CP_EXT, cell.nof_prb)) {
-    fprintf(stderr, "Error creating iFFT object\n");
-    exit(-1);
-  }
-  
-  srslte_ofdm_set_non_mbsfn_region(&ifft_mbsfn, non_mbsfn_region);
-  srslte_ofdm_set_non_mbsfn_region(&fft_mbsfn, non_mbsfn_region);
-  srslte_ofdm_set_normalize(&ifft_mbsfn, true);
-  srslte_ofdm_set_normalize(&fft_mbsfn, true);
-  
-
-  for (i = 0; i < cell.nof_ports; i++) {
-    tx_sf_symbols[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_PRB(cell.nof_prb));
-  }
-
-  for (i = 0; i < nof_rx_antennas; i++) {
-    rx_sf_symbols[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_PRB(cell.nof_prb));
-  }
-#endif /* DO_OFDM */
-
-  /* Configure PDSCH */
-  
-  if (srslte_pmch_cfg(&pmch_cfg, cell, &grant, cfi, subframe)) {
-    fprintf(stderr, "Error configuring PMCH\n");
-    exit(-1);
-  }
-  
   /* init memory */
   for (i=0;i<SRSLTE_MAX_PORTS;i++) {
     for (j = 0; j < SRSLTE_MAX_PORTS; j++) {
@@ -235,6 +200,25 @@ int main(int argc, char **argv) {
     }
   }
 
+  for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
+    softbuffers_tx[i] = calloc(sizeof(srslte_softbuffer_tx_t), 1);
+    if (!softbuffers_tx[i]) {
+      fprintf(stderr, "Error allocating TX soft buffer\n");
+    }
+
+    if (srslte_softbuffer_tx_init(softbuffers_tx[i], cell.nof_prb)) {
+      fprintf(stderr, "Error initiating TX soft buffer\n");
+      goto quit;
+    }
+  }
+
+  for (i = 0; i < cell.nof_ports; i++) {
+    tx_slot_symbols[i] = calloc(SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp), sizeof(cf_t));
+    if (!tx_slot_symbols[i]) {
+      perror("srslte_vec_malloc");
+      goto quit;
+    }
+  }
 
   for (int i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
     if (grant.tb_en[i]) {
@@ -256,7 +240,7 @@ int main(int argc, char **argv) {
   }
 
 
-  
+
   for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
     softbuffers_rx[i] = calloc(sizeof(srslte_softbuffer_rx_t), 1);
     if (!softbuffers_rx[i]) {
@@ -269,6 +253,44 @@ int main(int argc, char **argv) {
       goto quit;
     }
   }
+
+#ifdef DO_OFDM
+
+  for (i = 0; i < cell.nof_ports; i++) {
+    tx_sf_symbols[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_PRB(cell.nof_prb));
+
+    if (srslte_ofdm_tx_init_mbsfn(&ifft_mbsfn[i], SRSLTE_CP_EXT, tx_slot_symbols[i], tx_sf_symbols[i], cell.nof_prb)) {
+      fprintf(stderr, "Error creating iFFT object\n");
+      exit(-1);
+    }
+
+    srslte_ofdm_set_non_mbsfn_region(&ifft_mbsfn[i], non_mbsfn_region);
+    srslte_ofdm_set_normalize(&ifft_mbsfn[i], true);
+  }
+
+  for (i = 0; i < nof_rx_antennas; i++) {
+    rx_sf_symbols[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_PRB(cell.nof_prb));
+
+    if (srslte_ofdm_rx_init_mbsfn(&fft_mbsfn[i], SRSLTE_CP_EXT, rx_sf_symbols[i], rx_slot_symbols[i], cell.nof_prb)) {
+      fprintf(stderr, "Error creating iFFT object\n");
+      exit(-1);
+    }
+
+    srslte_ofdm_set_non_mbsfn_region(&fft_mbsfn[i], non_mbsfn_region);
+    srslte_ofdm_set_normalize(&fft_mbsfn[i], true);
+  }
+
+
+
+
+#endif /* DO_OFDM */
+
+  /* Configure PMCH */
+  
+  if (srslte_pmch_cfg(&pmch_cfg, cell, &grant, cfi, subframe)) {
+    fprintf(stderr, "Error configuring PMCH\n");
+    exit(-1);
+  }
   
   if (srslte_pmch_cfg(&pmch_cfg, cell, &grant, cfi, subframe)) {
       fprintf(stderr, "Error configuring PMCH\n");
@@ -312,25 +334,7 @@ int main(int argc, char **argv) {
   srslte_pmch_set_area_id(&pmch_rx, mbsfn_area_id);
 
 
-  for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
-    softbuffers_tx[i] = calloc(sizeof(srslte_softbuffer_tx_t), 1);
-    if (!softbuffers_tx[i]) {
-      fprintf(stderr, "Error allocating TX soft buffer\n");
-    }
 
-    if (srslte_softbuffer_tx_init(softbuffers_tx[i], cell.nof_prb)) {
-      fprintf(stderr, "Error initiating TX soft buffer\n");
-      goto quit;
-    }
-  }
-
-  for (i = 0; i < cell.nof_ports; i++) {
-    tx_slot_symbols[i] = calloc(SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp), sizeof(cf_t));
-    if (!tx_slot_symbols[i]) {
-      perror("srslte_vec_malloc");
-      goto quit;
-    }
-  }
 
   for (int tb = 0; tb < SRSLTE_MAX_CODEWORDS; tb++) {
     if (grant.tb_en[tb]) {
@@ -353,7 +357,7 @@ int main(int argc, char **argv) {
 #ifdef DO_OFDM
   for (i = 0; i < cell.nof_ports; i++) {
     /* For each Tx antenna modulate OFDM */
-    srslte_ofdm_tx_sf(&ifft_mbsfn, tx_slot_symbols[i], tx_sf_symbols[i]);
+    srslte_ofdm_tx_sf(&ifft_mbsfn[i]);
   }
 
 
@@ -387,7 +391,7 @@ int main(int argc, char **argv) {
 #ifdef DO_OFDM
     /* For each Rx antenna demodulate OFDM */
     for (i = 0; i < nof_rx_antennas; i++) {
-      srslte_ofdm_rx_sf(&fft_mbsfn, tx_sf_symbols[i], rx_slot_symbols[i]);
+      srslte_ofdm_rx_sf(&fft_mbsfn[i]);
     }
 #endif
   for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
diff --git a/lib/src/phy/sync/test/sync_test.c b/lib/src/phy/sync/test/sync_test.c
index 8715b316e..1e3951884 100644
--- a/lib/src/phy/sync/test/sync_test.c
+++ b/lib/src/phy/sync/test/sync_test.c
@@ -108,8 +108,8 @@ int main(int argc, char **argv) {
     perror("malloc");
     exit(-1);
   }
-  
-  if (srslte_ofdm_tx_init(&ifft, cp, nof_prb)) {
+
+  if (srslte_ofdm_tx_init(&ifft, cp, buffer, fft_buffer, nof_prb)) {
     fprintf(stderr, "Error creating iFFT object\n");
     exit(-1);
   }
@@ -150,8 +150,14 @@ int main(int argc, char **argv) {
 
       /* Transform to OFDM symbols */
       memset(fft_buffer, 0, sizeof(cf_t) * FLEN);
-      srslte_ofdm_tx_sf(&ifft, buffer, &fft_buffer[offset]);
+      srslte_ofdm_tx_sf(&ifft);
       
+      /* Apply sample offset */
+      for (int i = 0; i < FLEN; i++) {
+        fft_buffer[FLEN - i - 1 + offset] = fft_buffer[FLEN - i - 1];
+      }
+      bzero(fft_buffer, sizeof(cf_t) * offset);
+
       if (srslte_sync_find(&syncobj, fft_buffer, 0, &find_idx) < 0) {
         fprintf(stderr, "Error running srslte_sync_find\n");
         exit(-1);
diff --git a/lib/src/phy/ue/ue_dl.c b/lib/src/phy/ue/ue_dl.c
index c4e2d3f6c..2ff9254ee 100644
--- a/lib/src/phy/ue/ue_dl.c
+++ b/lib/src/phy/ue/ue_dl.c
@@ -53,6 +53,7 @@ static srslte_dci_format_t common_formats[] = {SRSLTE_DCI_FORMAT1A,SRSLTE_DCI_FO
 const uint32_t nof_common_formats = 2; 
 
 int srslte_ue_dl_init(srslte_ue_dl_t *q,
+                      cf_t *in_buffer[SRSLTE_MAX_PORTS],
                       uint32_t max_prb,
                       uint32_t nof_rx_antennas)
 {
@@ -73,12 +74,35 @@ int srslte_ue_dl_init(srslte_ue_dl_t *q,
     q->sample_offset = 0; 
     q->nof_rx_antennas = nof_rx_antennas;
 
-    if (srslte_ofdm_rx_init(&q->fft, SRSLTE_CP_NORM, max_prb)) {
-      fprintf(stderr, "Error initiating FFT\n");
-      goto clean_exit;
+    for (int j = 0; j < SRSLTE_MAX_PORTS; j++) {
+      q->sf_symbols_m[j] = srslte_vec_malloc(MAX_SFLEN_RE * sizeof(cf_t));
+      if (!q->sf_symbols_m[j]) {
+        perror("malloc");
+        goto clean_exit;
+      }
+      for (uint32_t i=0;i<SRSLTE_MAX_PORTS;i++) {
+        q->ce_m[i][j] = srslte_vec_malloc(MAX_SFLEN_RE * sizeof(cf_t));
+        if (!q->ce_m[i][j]) {
+          perror("malloc");
+          goto clean_exit;
+        }
+        bzero(q->ce_m[i][j], MAX_SFLEN_RE * sizeof(cf_t));
+      }
     }
-        
-    if (srslte_ofdm_rx_init_mbsfn(&q->fft_mbsfn, SRSLTE_CP_EXT, max_prb)) {
+
+    q->sf_symbols = q->sf_symbols_m[0];
+    for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
+      q->ce[i] = q->ce_m[i][0];
+    }
+
+    for (int i = 0; i < nof_rx_antennas; i++) {
+      if (srslte_ofdm_rx_init(&q->fft[i], SRSLTE_CP_NORM, in_buffer[i], q->sf_symbols_m[i], max_prb)) {
+        fprintf(stderr, "Error initiating FFT\n");
+        goto clean_exit;
+      }
+    }
+
+    if (srslte_ofdm_rx_init_mbsfn(&q->fft_mbsfn, SRSLTE_CP_EXT, in_buffer[0], q->sf_symbols_m[0], max_prb)) {
       fprintf(stderr, "Error initiating FFT for MBSFN subframes \n");
       goto clean_exit;
     }
@@ -127,28 +151,7 @@ int srslte_ue_dl_init(srslte_ue_dl_t *q,
       fprintf(stderr, "Error initiating SFO correct\n");
       goto clean_exit;
     }
-    srslte_cfo_set_tol(&q->sfo_correct, 1e-5f/q->fft.symbol_sz);
-    
-    for (int j = 0; j < SRSLTE_MAX_PORTS; j++) {
-      q->sf_symbols_m[j] = srslte_vec_malloc(MAX_SFLEN_RE * sizeof(cf_t));
-      if (!q->sf_symbols_m[j]) {
-        perror("malloc");
-        goto clean_exit; 
-      }
-      for (uint32_t i=0;i<SRSLTE_MAX_PORTS;i++) {
-        q->ce_m[i][j] = srslte_vec_malloc(MAX_SFLEN_RE * sizeof(cf_t));
-        if (!q->ce_m[i][j]) {
-          perror("malloc");
-          goto clean_exit; 
-        }
-        bzero(q->ce_m[i][j], MAX_SFLEN_RE * sizeof(cf_t));
-      }
-    }
-    
-    q->sf_symbols = q->sf_symbols_m[0];
-    for (int i=0;i<SRSLTE_MAX_PORTS;i++) {
-      q->ce[i] = q->ce_m[i][0];
-    }
+    srslte_cfo_set_tol(&q->sfo_correct, 1e-5f/q->fft[0].symbol_sz);
     
     ret = SRSLTE_SUCCESS;
   } else {
@@ -164,7 +167,9 @@ clean_exit:
 
 void srslte_ue_dl_free(srslte_ue_dl_t *q) {
   if (q) {
-    srslte_ofdm_rx_free(&q->fft);
+    for (int port = 0; port < SRSLTE_MAX_PORTS; port++) {
+      srslte_ofdm_rx_free(&q->fft[port]);
+    }
     srslte_ofdm_rx_free(&q->fft_mbsfn);
     srslte_chest_dl_free(&q->chest);
     srslte_regs_free(&q->regs);
@@ -219,10 +224,12 @@ int srslte_ue_dl_set_cell(srslte_ue_dl_t *q, srslte_cell_t cell)
         fprintf(stderr, "Error resizing SFO correct\n");
         return SRSLTE_ERROR;
       }
-      srslte_cfo_set_tol(&q->sfo_correct, 1e-5/q->fft.symbol_sz);
-      if (srslte_ofdm_rx_set_prb(&q->fft, q->cell.cp, q->cell.nof_prb)) {
-        fprintf(stderr, "Error resizing FFT\n");
-        return SRSLTE_ERROR;
+      srslte_cfo_set_tol(&q->sfo_correct, 1e-5f/q->fft[0].symbol_sz);
+      for (int port = 0; port < q->nof_rx_antennas; port++) {
+        if (srslte_ofdm_rx_set_prb(&q->fft[port], q->cell.cp, q->cell.nof_prb)) {
+          fprintf(stderr, "Error resizing FFT\n");
+          return SRSLTE_ERROR;
+        }
       }
       if (srslte_chest_dl_set_cell(&q->chest, q->cell)) {
         fprintf(stderr, "Error resizing channel estimator\n");
@@ -339,9 +346,9 @@ int srslte_ue_dl_decode_fft_estimate_mbsfn(srslte_ue_dl_t *q, cf_t *input[SRSLTE
     /* Run FFT for all subframe data */
     for (int j=0;j<q->nof_rx_antennas;j++) {
       if(sf_type == SRSLTE_SF_MBSFN ) {
-        srslte_ofdm_rx_sf(&q->fft_mbsfn, input[j], q->sf_symbols_m[j]);
+        srslte_ofdm_rx_sf(&q->fft_mbsfn);
       }else{
-        srslte_ofdm_rx_sf(&q->fft, input[j], q->sf_symbols_m[j]);
+        srslte_ofdm_rx_sf(&q->fft[j]);
       }
 
       /* Correct SFO multiplying by complex exponential in the time domain */
@@ -351,7 +358,7 @@ int srslte_ue_dl_decode_fft_estimate_mbsfn(srslte_ue_dl_t *q, cf_t *input[SRSLTE
           srslte_cfo_correct(&q->sfo_correct, 
                           &q->sf_symbols_m[j][i*q->cell.nof_prb*SRSLTE_NRE], 
                           &q->sf_symbols_m[j][i*q->cell.nof_prb*SRSLTE_NRE], 
-                          q->sample_offset / q->fft.symbol_sz);
+                          q->sample_offset / q->fft[j].symbol_sz);
         }
       }
     }
diff --git a/lib/src/phy/ue/ue_mib.c b/lib/src/phy/ue/ue_mib.c
index 46a470ab8..c003a2ba3 100644
--- a/lib/src/phy/ue/ue_mib.c
+++ b/lib/src/phy/ue/ue_mib.c
@@ -35,7 +35,8 @@
 #include "srslte/phy/utils/debug.h"
 #include "srslte/phy/utils/vector.h"
 
-int srslte_ue_mib_init(srslte_ue_mib_t * q, 
+int srslte_ue_mib_init(srslte_ue_mib_t * q,
+                       cf_t *in_buffer[SRSLTE_MAX_PORTS],
                        uint32_t max_prb)
 {
   int ret = SRSLTE_ERROR_INVALID_INPUTS;
@@ -65,7 +66,7 @@ int srslte_ue_mib_init(srslte_ue_mib_t * q,
       }
     }
 
-    if (srslte_ofdm_rx_init(&q->fft, SRSLTE_CP_NORM, max_prb)) {
+    if (srslte_ofdm_rx_init(&q->fft, SRSLTE_CP_NORM, in_buffer[0], q->sf_symbols, max_prb)) {
       fprintf(stderr, "Error initializing FFT\n");
       goto clean_exit;
     }
@@ -143,14 +144,14 @@ void srslte_ue_mib_reset(srslte_ue_mib_t * q)
   srslte_pbch_decode_reset(&q->pbch);
 }
 
-int srslte_ue_mib_decode(srslte_ue_mib_t * q, cf_t *input, 
+int srslte_ue_mib_decode(srslte_ue_mib_t * q,
                   uint8_t bch_payload[SRSLTE_BCH_PAYLOAD_LEN], uint32_t *nof_tx_ports, int *sfn_offset)
 {
   int ret = SRSLTE_SUCCESS;
   cf_t *ce_slot1[SRSLTE_MAX_PORTS]; 
 
   /* Run FFT for the slot symbols */
-  srslte_ofdm_rx_sf(&q->fft, input, q->sf_symbols);
+  srslte_ofdm_rx_sf(&q->fft);
             
   /* Get channel estimates of sf idx #0 for each port */
   ret = srslte_chest_dl_estimate(&q->chest, q->sf_symbols, q->ce, 0);
@@ -198,7 +199,7 @@ int srslte_ue_mib_sync_init_multi(srslte_ue_mib_sync_t *q,
   }
   q->nof_rx_antennas = nof_rx_antennas;
   
-  if (srslte_ue_mib_init(&q->ue_mib, SRSLTE_UE_MIB_NOF_PRB)) {
+  if (srslte_ue_mib_init(&q->ue_mib, q->sf_buffer, SRSLTE_UE_MIB_NOF_PRB)) {
     fprintf(stderr, "Error initiating ue_mib\n");
     return SRSLTE_ERROR;
   }
@@ -274,7 +275,7 @@ int srslte_ue_mib_sync_decode(srslte_ue_mib_sync_t * q,
         return -1;
       } else if (srslte_ue_sync_get_sfidx(&q->ue_sync) == 0) {
         if (ret == 1) {
-          mib_ret = srslte_ue_mib_decode(&q->ue_mib, q->sf_buffer[0], bch_payload, nof_tx_ports, sfn_offset);                    
+          mib_ret = srslte_ue_mib_decode(&q->ue_mib, bch_payload, nof_tx_ports, sfn_offset);
         } else {
           DEBUG("Resetting PBCH decoder after %d frames\n", q->ue_mib.frame_cnt);
           srslte_ue_mib_reset(&q->ue_mib);
diff --git a/lib/src/phy/ue/ue_ul.c b/lib/src/phy/ue/ue_ul.c
index 37dfecd93..853937f7c 100644
--- a/lib/src/phy/ue/ue_ul.c
+++ b/lib/src/phy/ue/ue_ul.c
@@ -41,6 +41,7 @@
 #define DEFAULT_CFO_TOL   50.0 // Hz
 
 int srslte_ue_ul_init(srslte_ue_ul_t *q,
+                      cf_t *out_buffer,
                       uint32_t max_prb)
 {
   int ret = SRSLTE_ERROR_INVALID_INPUTS; 
@@ -50,8 +51,14 @@ int srslte_ue_ul_init(srslte_ue_ul_t *q,
     ret = SRSLTE_ERROR;
     
     bzero(q, sizeof(srslte_ue_ul_t));
-    
-    if (srslte_ofdm_tx_init(&q->fft, SRSLTE_CP_NORM, max_prb)) {
+
+    q->sf_symbols = srslte_vec_malloc(SRSLTE_SF_LEN_PRB(max_prb) * sizeof(cf_t));
+    if (!q->sf_symbols) {
+      perror("malloc");
+      goto clean_exit;
+    }
+
+    if (srslte_ofdm_tx_init(&q->fft, SRSLTE_CP_NORM, q->sf_symbols, out_buffer, max_prb)) {
       fprintf(stderr, "Error initiating FFT\n");
       goto clean_exit;
     }
@@ -83,11 +90,6 @@ int srslte_ue_ul_init(srslte_ue_ul_t *q,
       fprintf(stderr, "Error initiating srslte_refsignal_ul\n");
       goto clean_exit;
     }
-    q->sf_symbols = srslte_vec_malloc(SRSLTE_SF_LEN_PRB(max_prb) * sizeof(cf_t));
-    if (!q->sf_symbols) {
-      perror("malloc");
-      goto clean_exit; 
-    }
     q->refsignal = srslte_vec_malloc(2 * SRSLTE_NRE * max_prb * sizeof(cf_t));
     if (!q->refsignal) {
       perror("malloc");
@@ -347,7 +349,7 @@ int srslte_ue_ul_pucch_encode(srslte_ue_ul_t *q, srslte_uci_data_t uci_data,
     
     q->last_pucch_format = format; 
 
-    srslte_ofdm_tx_sf(&q->fft, q->sf_symbols, output_signal);
+    srslte_ofdm_tx_sf(&q->fft);
     
     if (q->cfo_en) {
       srslte_cfo_correct(&q->cfo, output_signal, output_signal, q->current_cfo / srslte_symbol_sz(q->cell.nof_prb));
@@ -417,7 +419,7 @@ int srslte_ue_ul_srs_encode(srslte_ue_ul_t *q, uint32_t tti, cf_t *output_signal
       }
     }
     
-    srslte_ofdm_tx_sf(&q->fft, q->sf_symbols, output_signal);
+    srslte_ofdm_tx_sf(&q->fft);
     
     if (q->cfo_en) {
       srslte_cfo_correct(&q->cfo, output_signal, output_signal, q->current_cfo / srslte_symbol_sz(q->cell.nof_prb));
@@ -486,7 +488,7 @@ int srslte_ue_ul_pusch_encode_rnti_softbuffer(srslte_ue_ul_t *q,
       }
     }
     
-    srslte_ofdm_tx_sf(&q->fft, q->sf_symbols, output_signal);
+    srslte_ofdm_tx_sf(&q->fft);
     
     if (q->cfo_en) {
       srslte_cfo_correct(&q->cfo, output_signal, output_signal, q->current_cfo / srslte_symbol_sz(q->cell.nof_prb));
diff --git a/srsenb/hdr/phy/phch_worker.h b/srsenb/hdr/phy/phch_worker.h
index 906e8b9d0..48878a879 100644
--- a/srsenb/hdr/phy/phch_worker.h
+++ b/srsenb/hdr/phy/phch_worker.h
@@ -88,7 +88,7 @@ private:
   bool           running;
 
   cf_t          *signal_buffer_rx; 
-  cf_t          *signal_buffer_tx; 
+  cf_t          *signal_buffer_tx[SRSLTE_MAX_PORTS];
   uint32_t       tti_rx, tti_tx, tti_sched_ul, sf_rx, sf_tx, sf_sched_ul, tx_mutex_cnt;
 
   srslte_enb_dl_t enb_dl;
diff --git a/srsenb/src/phy/phch_worker.cc b/srsenb/src/phy/phch_worker.cc
index 3a1fb8ca4..8a6173347 100644
--- a/srsenb/src/phy/phch_worker.cc
+++ b/srsenb/src/phy/phch_worker.cc
@@ -93,12 +93,13 @@ void phch_worker::init(phch_common* phy_, srslte::log *log_h_)
     fprintf(stderr, "Error allocating memory\n");
     return; 
   }
-  signal_buffer_tx = (cf_t*) srslte_vec_malloc(2*SRSLTE_SF_LEN_PRB(phy->cell.nof_prb)*sizeof(cf_t));
-  if (!signal_buffer_tx) {
+  bzero(&signal_buffer_tx, sizeof(cf_t *) * SRSLTE_MAX_PORTS);
+  signal_buffer_tx[0] = (cf_t*) srslte_vec_malloc(2*SRSLTE_SF_LEN_PRB(phy->cell.nof_prb)*sizeof(cf_t));
+  if (!signal_buffer_tx[0]) {
     fprintf(stderr, "Error allocating memory\n");
     return; 
   }
-  if (srslte_enb_dl_init(&enb_dl, phy->cell.nof_prb)) {
+  if (srslte_enb_dl_init(&enb_dl, signal_buffer_tx, phy->cell.nof_prb)) {
     fprintf(stderr, "Error initiating ENB DL\n");
     return;
   }
@@ -106,7 +107,7 @@ void phch_worker::init(phch_common* phy_, srslte::log *log_h_)
     fprintf(stderr, "Error initiating ENB DL\n");
     return;
   }
-  if (srslte_enb_ul_init(&enb_ul, phy->cell.nof_prb)) {
+  if (srslte_enb_ul_init(&enb_ul, signal_buffer_rx, phy->cell.nof_prb)) {
     fprintf(stderr, "Error initiating ENB UL\n");
     return;
   }
@@ -156,8 +157,10 @@ void phch_worker::stop()
   if (signal_buffer_rx) {
     free(signal_buffer_rx);
   }
-  if (signal_buffer_tx) {
-    free(signal_buffer_tx);
+  for (int i = 0; i < SRSLTE_MAX_PORTS; i++) {
+    if (signal_buffer_tx[i]) {
+      free(signal_buffer_tx[i]);
+    }
   }
   pthread_mutex_unlock(&mutex);
   pthread_mutex_destroy(&mutex);
@@ -336,9 +339,9 @@ void phch_worker::work_imp()
   }
   
   // Generate signal and transmit
-  srslte_enb_dl_gen_signal(&enb_dl, signal_buffer_tx);  
+  srslte_enb_dl_gen_signal(&enb_dl);
   Debug("Sending to radio\n");
-  phy->worker_end(tx_mutex_cnt, signal_buffer_tx, SRSLTE_SF_LEN_PRB(phy->cell.nof_prb), tx_time);
+  phy->worker_end(tx_mutex_cnt, signal_buffer_tx[0], SRSLTE_SF_LEN_PRB(phy->cell.nof_prb), tx_time);
 
 #ifdef DEBUG_WRITE_FILE
   fwrite(signal_buffer_tx, SRSLTE_SF_LEN_PRB(phy->cell.nof_prb)*sizeof(cf_t), 1, f);
diff --git a/srsue/src/phy/phch_recv.cc b/srsue/src/phy/phch_recv.cc
index c14141257..041418b5a 100644
--- a/srsue/src/phy/phch_recv.cc
+++ b/srsue/src/phy/phch_recv.cc
@@ -101,13 +101,13 @@ void phch_recv::  init(srslte::radio_multi *_radio_handler, mac_interface_phy *_
   if (do_agc) {
     srslte_ue_sync_start_agc(&cs.ue_sync, callback_set_rx_gain, last_gain);
   }
-
-  if (srslte_ue_dl_init(&ue_dl_measure, SRSLTE_MAX_PRB, nof_rx_antennas)) {
+  
+  if (srslte_ue_dl_init(&ue_dl_measure, sf_buffer, SRSLTE_MAX_PRB, nof_rx_antennas)) {
     Error("SYNC:  Initiating ue_dl_measure\n");
     return;
   }
 
-  if (srslte_ue_mib_init(&ue_mib, SRSLTE_MAX_PRB)) {
+  if (srslte_ue_mib_init(&ue_mib, sf_buffer, SRSLTE_MAX_PRB)) {
     Error("SYNC:  Initiating UE MIB decoder\n");
     return;
   }
@@ -374,7 +374,7 @@ int phch_recv::cell_sync_sfn(void) {
       int sfn_offset = 0;
       Info("SYNC:  Trying to decode MIB... SNR=%.1f dB\n",
            10*log10(srslte_chest_dl_get_snr(&ue_mib.chest)));
-      int n = srslte_ue_mib_decode(&ue_mib, sf_buffer[0], bch_payload, NULL, &sfn_offset);
+      int n = srslte_ue_mib_decode(&ue_mib, bch_payload, NULL, &sfn_offset);
       if (n < 0) {
         Error("SYNC:  Error decoding MIB while synchronising SFN");
         return -1;
diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc
index c0fec2ba2..ee1011aea 100644
--- a/srsue/src/phy/phch_worker.cc
+++ b/srsue/src/phy/phch_worker.cc
@@ -118,12 +118,12 @@ bool phch_worker::init(uint32_t max_prb, srslte::log *log_h)
     }
   }
 
-  if (srslte_ue_dl_init(&ue_dl, max_prb, phy->args->nof_rx_ant)) {
+  if (srslte_ue_dl_init(&ue_dl, signal_buffer, max_prb, phy->args->nof_rx_ant)) {
     Error("Initiating UE DL\n");
     return false;
   }
 
-  if (srslte_ue_ul_init(&ue_ul, max_prb)) {
+  if (srslte_ue_ul_init(&ue_ul, signal_buffer[0], max_prb)) {
     Error("Initiating UE UL\n");
     return false;
   }

From b0157cb4f870b330c6b2daa737c9335a43aaa17e Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 20 Oct 2017 10:40:59 -0400
Subject: [PATCH 52/55] Removed error when can't save fft wisdom (prints in
 mkl)

---
 lib/src/phy/dft/dft_fftw.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/src/phy/dft/dft_fftw.c b/lib/src/phy/dft/dft_fftw.c
index a06dd397f..9d6898117 100644
--- a/lib/src/phy/dft/dft_fftw.c
+++ b/lib/src/phy/dft/dft_fftw.c
@@ -56,9 +56,7 @@ void srslte_dft_load() {
 
 void srslte_dft_exit() {
 #ifdef FFTW_WISDOM_FILE
-  if (!fftwf_export_wisdom_to_filename(FFTW_WISDOM_FILE)) {
-    fprintf(stderr, "Error saving FFTW wisdom to file %s\n", FFTW_WISDOM_FILE);
-  }
+  fftwf_export_wisdom_to_filename(FFTW_WISDOM_FILE);
 #endif
 }
 

From c1b296eb2cb516329b6f66a48549c81cfff02ff0 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga <xavier@softwareradiosystems.com>
Date: Fri, 20 Oct 2017 18:17:07 +0200
Subject: [PATCH 53/55] SSE optimization for srslte_bit_interleave_w_offset

---
 lib/src/phy/utils/bit.c | 126 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/lib/src/phy/utils/bit.c b/lib/src/phy/utils/bit.c
index 9ef53c35a..b1ae383a6 100644
--- a/lib/src/phy/utils/bit.c
+++ b/lib/src/phy/utils/bit.c
@@ -31,6 +31,12 @@
 #include <string.h>
 #include <stddef.h>
 
+#ifdef LV_HAVE_SSE
+
+#include <immintrin.h>
+
+#endif /* LV_HAVE_SSE */
+
 #include "srslte/phy/utils/bit.h"
 
 void srslte_bit_interleave(uint8_t *input, uint8_t *output, uint16_t *interleaver, uint32_t nof_bits) {
@@ -53,6 +59,125 @@ void srslte_bit_interleave_w_offset(uint8_t *input, uint8_t *output, uint16_t *i
     }
     w_offset_p=8-w_offset;
   }
+#ifdef LV_HAVE_SSE
+  __m64 m64mask = _mm_setr_pi8((uint8_t) 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1);
+  __m128i m128mask = _mm_set1_epi64(m64mask);
+
+  union {
+    uint8_t v[8];
+    __m64 m64;
+  } a, b, c;
+
+  union {
+    __m128i m128;
+    uint16_t u16[8];
+    uint8_t u8[16];
+    struct {
+      __m64 reg_a;
+      __m64 reg_b;
+    } m64;
+    struct {
+      uint16_t i0, i1, i2, i3, i4, i5, i6, i7;
+    } v;
+  } ipx, epx, ipx2, epx2, b128, a128, c128;
+
+  uint32_t i = st;
+  for (; i < (nof_bits / 8 - 1); i += 2) {
+    ipx.m128 = _mm_loadu_si128((__m128i *) (interleaver + (i * 8) - w_offset_p));
+    epx.m128 = _mm_shuffle_epi8(ipx.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                       0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+    ipx2.m128 = _mm_loadu_si128((__m128i *) (interleaver + ((i + 1) * 8) - w_offset_p));
+    epx2.m128 = _mm_shuffle_epi8(ipx2.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                         0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+
+    epx.m64.reg_b = epx2.m64.reg_a;
+
+    b128.m128 = _mm_and_si128(epx.m128, _mm_set1_epi8(0x7));
+    b128.m128 = _mm_shuffle_epi8(m128mask, b128.m128);
+
+    ipx.m128 = _mm_srli_epi16(ipx.m128, 3);
+    ipx2.m128 = _mm_srli_epi16(ipx2.m128, 3);
+
+    a128.m128 = _mm_set_epi8(input[ipx2.v.i0],
+                             input[ipx2.v.i1],
+                             input[ipx2.v.i2],
+                             input[ipx2.v.i3],
+                             input[ipx2.v.i4],
+                             input[ipx2.v.i5],
+                             input[ipx2.v.i6],
+                             input[ipx2.v.i7],
+                             input[ipx.v.i0],
+                             input[ipx.v.i1],
+                             input[ipx.v.i2],
+                             input[ipx.v.i3],
+                             input[ipx.v.i4],
+                             input[ipx.v.i5],
+                             input[ipx.v.i6],
+                             input[ipx.v.i7]);
+
+    c128.m128 = _mm_cmpeq_epi8(_mm_and_si128(a128.m128, b128.m128), b128.m128);
+    uint16_t o = (uint16_t) _mm_movemask_epi8(c128.m128);
+    *((uint16_t *) (output + i)) = o;
+  }
+
+  for (; i < nof_bits / 8; i++) {
+    ipx.m128 = _mm_loadu_si128((__m128i *) (interleaver + i * 8 - w_offset_p));
+    epx.m128 = _mm_shuffle_epi8(ipx.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                       0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+    b.m64 = _mm_and_si64(epx.m64.reg_a, _mm_set1_pi8(0x7));
+    b.m64 = _mm_shuffle_pi8(m64mask, b.m64);
+
+    ipx.m128 = _mm_srli_epi16(ipx.m128, 3);
+
+    a.m64 = _mm_set_pi8(input[ipx.v.i0],
+                        input[ipx.v.i1],
+                        input[ipx.v.i2],
+                        input[ipx.v.i3],
+                        input[ipx.v.i4],
+                        input[ipx.v.i5],
+                        input[ipx.v.i6],
+                        input[ipx.v.i7]);
+
+    c.m64 = _mm_cmpeq_pi8(_mm_and_si64(a.m64, b.m64), b.m64);
+    output[i] = (uint8_t) _mm_movemask_pi8(c.m64);
+  }
+
+#if 0
+  /* THIS PIECE OF CODE IS FOR CHECKING SIMD BEHAVIOUR. DO NOT ENABLE. */
+  uint8_t *output2 = malloc(nof_bits/8);
+  for (i=st;i<nof_bits/8;i++) {
+
+    uint16_t i_p0 = interleaver[i*8+0-w_offset_p];
+    uint16_t i_p1 = interleaver[i*8+1-w_offset_p];
+    uint16_t i_p2 = interleaver[i*8+2-w_offset_p];
+    uint16_t i_p3 = interleaver[i*8+3-w_offset_p];
+    uint16_t i_p4 = interleaver[i*8+4-w_offset_p];
+    uint16_t i_p5 = interleaver[i*8+5-w_offset_p];
+    uint16_t i_p6 = interleaver[i*8+6-w_offset_p];
+    uint16_t i_p7 = interleaver[i*8+7-w_offset_p];
+
+    uint8_t out0  = (input[i_p0/8] & mask[i_p0%8])?mask[0]:(uint8_t)0;
+    uint8_t out1  = (input[i_p1/8] & mask[i_p1%8])?mask[1]:(uint8_t)0;
+    uint8_t out2  = (input[i_p2/8] & mask[i_p2%8])?mask[2]:(uint8_t)0;
+    uint8_t out3  = (input[i_p3/8] & mask[i_p3%8])?mask[3]:(uint8_t)0;
+    uint8_t out4  = (input[i_p4/8] & mask[i_p4%8])?mask[4]:(uint8_t)0;
+    uint8_t out5  = (input[i_p5/8] & mask[i_p5%8])?mask[5]:(uint8_t)0;
+    uint8_t out6  = (input[i_p6/8] & mask[i_p6%8])?mask[6]:(uint8_t)0;
+    uint8_t out7  = (input[i_p7/8] & mask[i_p7%8])?mask[7]:(uint8_t)0;
+
+    output2[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
+  }
+
+  for(i = st; i < nof_bits/8; i++) {
+    if (output[i] != output2[i]) {
+      printf("%05d/%05d %02X %02X\n", i, nof_bits/8, output[i], output2[i]);
+    }
+    //output[i] = output2[i];
+  }
+  free(output2);
+#endif
+
+#else /* LV_HAVE_SSE */
   for (uint32_t i=st;i<nof_bits/8;i++) {
     
     uint16_t i_p0 = interleaver[i*8+0-w_offset_p];
@@ -75,6 +200,7 @@ void srslte_bit_interleave_w_offset(uint8_t *input, uint8_t *output, uint16_t *i
     
     output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; 
   }
+#endif /* LV_HAVE_SSE */
   for (uint32_t j=0;j<nof_bits%8;j++) {
     uint16_t i_p = interleaver[(nof_bits/8)*8+j-w_offset];          
     if (input[i_p/8] & mask[i_p%8]) {

From 85571c47cd4f65b816496fa335ba582671f0db6d Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 20 Oct 2017 12:34:34 -0400
Subject: [PATCH 54/55] Added option to ue.conf to configure CFO EMA (set
 default to 0.4)

---
 lib/include/srslte/interfaces/ue_interfaces.h | 3 ++-
 lib/include/srslte/phy/ue/ue_sync.h           | 5 ++++-
 lib/src/phy/ue/ue_sync.c                      | 5 +++++
 srsue/src/main.cc                             | 5 +++++
 srsue/src/phy/phch_recv.cc                    | 1 +
 srsue/ue.conf.example                         | 9 ++++++++-
 6 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/include/srslte/interfaces/ue_interfaces.h b/lib/include/srslte/interfaces/ue_interfaces.h
index 8561ba55c..88dfd285c 100644
--- a/lib/include/srslte/interfaces/ue_interfaces.h
+++ b/lib/include/srslte/interfaces/ue_interfaces.h
@@ -445,7 +445,8 @@ typedef struct {
   float snr_ema_coeff; 
   std::string snr_estim_alg; 
   bool cfo_integer_enabled; 
-  float cfo_correct_tol_hz; 
+  float cfo_correct_tol_hz;
+  float cfo_ema;
   int time_correct_period; 
   bool sfo_correct_disable; 
   std::string sss_algorithm; 
diff --git a/lib/include/srslte/phy/ue/ue_sync.h b/lib/include/srslte/phy/ue/ue_sync.h
index bd280acd0..e5877dd23 100644
--- a/lib/include/srslte/phy/ue/ue_sync.h
+++ b/lib/include/srslte/phy/ue/ue_sync.h
@@ -185,7 +185,10 @@ SRSLTE_API void srslte_ue_sync_set_cfo_tol(srslte_ue_sync_t *q,
                                            float tol);
 
 SRSLTE_API void srslte_ue_sync_set_cfo(srslte_ue_sync_t *q, 
-                                       float cfo); 
+                                       float cfo);
+
+SRSLTE_API void srslte_ue_sync_set_cfo_ema(srslte_ue_sync_t *q,
+                                           float ema);
 
 SRSLTE_API void srslte_ue_sync_cfo_i_detec_en(srslte_ue_sync_t *q, 
                                               bool enable); 
diff --git a/lib/src/phy/ue/ue_sync.c b/lib/src/phy/ue/ue_sync.c
index f121c5bac..e9c02ca8b 100644
--- a/lib/src/phy/ue/ue_sync.c
+++ b/lib/src/phy/ue/ue_sync.c
@@ -359,6 +359,11 @@ uint32_t srslte_ue_sync_peak_idx(srslte_ue_sync_t *q) {
   return q->peak_idx;
 }
 
+void srslte_ue_sync_set_cfo_ema(srslte_ue_sync_t *q, float ema) {
+  srslte_sync_set_cfo_ema_alpha(&q->sfind, ema);
+  srslte_sync_set_cfo_ema_alpha(&q->strack, ema);
+}
+
 srslte_ue_sync_state_t srslte_ue_sync_get_state(srslte_ue_sync_t *q) {
   return q->state;
 }
diff --git a/srsue/src/main.cc b/srsue/src/main.cc
index 1b146a9f3..52d4b13c9 100644
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@@ -202,6 +202,11 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
      bpo::value<float>(&args->expert.phy.cfo_correct_tol_hz)->default_value(50.0),
      "Tolerance (in Hz) for digial CFO compensation.")
 
+    ("expert.cfo_ema",
+     bpo::value<float>(&args->expert.phy.cfo_ema)->default_value(0.4),
+     "CFO Exponential Moving Average coefficient. Lower makes it more robust to noise "
+     "but vulnerable to periodic interruptions due to VCO corrections.")
+
     ("expert.time_correct_period",
      bpo::value<int>(&args->expert.phy.time_correct_period)->default_value(5),
      "Period for sampling time offset correction.")
diff --git a/srsue/src/phy/phch_recv.cc b/srsue/src/phy/phch_recv.cc
index cf8bd6786..8a69a7f76 100644
--- a/srsue/src/phy/phch_recv.cc
+++ b/srsue/src/phy/phch_recv.cc
@@ -209,6 +209,7 @@ void phch_recv::set_ue_sync_opts(srslte_ue_sync_t *q) {
     srslte_ue_sync_cfo_i_detec_en(q, true);
   }
 
+  srslte_ue_sync_set_cfo_ema(q, worker_com->args->cfo_ema);
   srslte_ue_sync_set_cfo_tol(q, worker_com->args->cfo_correct_tol_hz);
 
   int time_correct_period = worker_com->args->time_correct_period;
diff --git a/srsue/ue.conf.example b/srsue/ue.conf.example
index d0dc67952..5b3559ae9 100644
--- a/srsue/ue.conf.example
+++ b/srsue/ue.conf.example
@@ -98,7 +98,10 @@ enable = false
 #####################################################################
 # Expert configuration options
 #
+# ue_category:          Sets UE category (range 1-5). Default: 4
 # ip_netmask:           Netmask of the tun_srsue device. Default: 255.255.255.0
+# rssi_sensor_enabled:  Enable or disable RF frontend RSSI sensor. Required for RSRP metrics but
+#                       can cause UHD instability for long-duration testing. Default: true.
 # ue_category:          Sets UE category (range 1-5). Default: 4
 #
 # prach_gain:           PRACH gain (dB). If defined, forces a gain for the tranmsission of PRACH only., 
@@ -116,7 +119,9 @@ enable = false
 # nof_phy_threads:      Selects the number of PHY threads (maximum 4, minimum 1, default 2)
 # equalizer_mode:       Selects equalizer mode. Valid modes are: "mmse", "zf" or any 
 #                       non-negative real number to indicate a regularized zf coefficient.
-#                       Default is MMSE. 
+#                       Default is MMSE.
+# cfo_ema:              CFO Exponential Moving Average coefficient. Lower makes it more robust to noise
+#                       but vulnerable to periodic interruptions due to VCO corrections. Default: 0.4.
 # cfo_integer_enabled:  Enables integer CFO estimation and correction. This needs improvement
 #                       and may lead to incorrect synchronization. Use with caution. 
 # cfo_correct_tol_hz:   Tolerance (in Hz) for digial CFO compensation. Lower tolerance means that 
@@ -140,6 +145,7 @@ enable = false
 #####################################################################
 [expert]
 #ip_netmask          = 255.255.255.0
+#rssi_sensor_enabled = false
 #ue_category         = 4
 #prach_gain          = 30
 #cqi_max             = 15
@@ -150,6 +156,7 @@ enable = false
 #attach_enable_64qam = false
 #nof_phy_threads     = 2
 #equalizer_mode      = mmse
+#cfo_ema             = 0.4
 #cfo_integer_enabled = false
 #cfo_correct_tol_hz  = 50
 #time_correct_period = 5

From 3cbe526cbc7fbaad9cb1febfcba4fa2c47783671 Mon Sep 17 00:00:00 2001
From: Ismael Gomez <ismagom@gmail.com>
Date: Fri, 20 Oct 2017 12:43:04 -0400
Subject: [PATCH 55/55] Revert "Disable RSSI sensor by default"

This reverts commit c14393b24f874b4e12d3bbfb883133fedda4f6f1.
---
 srsue/src/main.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srsue/src/main.cc b/srsue/src/main.cc
index 52d4b13c9..b933b66f3 100644
--- a/srsue/src/main.cc
+++ b/srsue/src/main.cc
@@ -155,7 +155,7 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
      "Pregenerate uplink signals after attach. Improves CPU performance.")
 
     ("expert.rssi_sensor_enabled",
-     bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(false),
+     bpo::value<bool>(&args->expert.phy.rssi_sensor_enabled)->default_value(true),
      "Enable or disable RF frontend RSSI sensor. In some USRP devices can cause segmentation fault")
 
     ("expert.prach_gain",