From 8c3a0153b9dcdaf066de06a812028622f91dfa7e Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Wed, 21 Nov 2018 18:06:14 +0100 Subject: [PATCH] Added missing AVX512 intrinsics and flags. Fixes #291. --- CMakeLists.txt | 4 ++-- cmake/modules/FindSSE.cmake | 2 +- lib/include/srslte/phy/utils/simd.h | 16 ++++++++++++++-- lib/src/phy/utils/test/vector_test.c | 26 ++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ccc1ce54d..1d0f05568 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,8 +306,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") endif (HAVE_FMA) if (HAVE_AVX512) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -DLV_HAVE_AVX512") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -DLV_HAVE_AVX512") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -mavx512bw -mavx512dq -DLV_HAVE_AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512cd -mavx512bw -mavx512dq -DLV_HAVE_AVX512") endif(HAVE_AVX512) if(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug") diff --git a/cmake/modules/FindSSE.cmake b/cmake/modules/FindSSE.cmake index e5101deff..24fc23662 100644 --- a/cmake/modules/FindSSE.cmake +++ b/cmake/modules/FindSSE.cmake @@ -142,7 +142,7 @@ if (ENABLE_SSE) # Check compiler for AVX intrinsics # if (CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) - set(CMAKE_REQUIRED_FLAGS "-mavx512f") + set(CMAKE_REQUIRED_FLAGS "-mavx512f -mavx512cd -mavx512bw -mavx512dq -DLV_HAVE_AVX512") check_c_source_runs(" #include int main() diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index 491c1f661..2529033c5 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -1341,7 +1341,13 @@ static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) { static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b) { #ifdef LV_HAVE_AVX512 -#error sign instruction not available in avx512 + __m256i a0 = _mm512_extracti64x4_epi64(a, 0); + __m256i a1 = _mm512_extracti64x4_epi64(a, 1); + __m256i b0 = _mm512_extracti64x4_epi64(b, 0); + __m256i b1 = _mm512_extracti64x4_epi64(b, 1); + __m256i r0 = _mm256_sign_epi16(a0, b0); + __m256i r1 = _mm256_sign_epi16(a1, b1); + return _mm512_inserti64x4(_mm512_broadcast_i64x4(r0), r1, 1); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 return _mm256_sign_epi16(a, b); @@ -1814,7 +1820,13 @@ static inline simd_s_t srslte_simd_b_sub(simd_s_t a, simd_s_t b) { static inline simd_s_t srslte_simd_b_neg(simd_b_t a, simd_b_t b) { #ifdef LV_HAVE_AVX512 -#error sign instruction not available in avx512 + __m256i a0 = _mm512_extracti64x4_epi64(a, 0); + __m256i a1 = _mm512_extracti64x4_epi64(a, 1); + __m256i b0 = _mm512_extracti64x4_epi64(b, 0); + __m256i b1 = _mm512_extracti64x4_epi64(b, 1); + __m256i r0 = _mm256_sign_epi8(a0, b0); + __m256i r1 = _mm256_sign_epi8(a1, b1); + return _mm512_inserti64x4(_mm512_broadcast_i64x4(r0), r1, 1); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 return _mm256_sign_epi8(a, b); diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c index 44f8af1ca..03e33305b 100644 --- a/lib/src/phy/utils/test/vector_test.c +++ b/lib/src/phy/utils/test/vector_test.c @@ -226,6 +226,29 @@ TEST(srslte_vec_prod_sss, free(z); ) +TEST(srslte_vec_neg_sss, + MALLOC(int16_t, x); + MALLOC(int16_t, y); + MALLOC(int16_t, z); + + int16_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_S(); + do { y[i] = RANDOM_S(); } while (!y[i]); + } + + TEST_CALL(srslte_vec_neg_sss(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = y[i] < 0 ? -x[i] : x[i]; + mse += abs(gold - z[i]); + } + + free(x); + free(y); + free(z); +) + TEST(srslte_vec_acc_cc, MALLOC(cf_t, x); cf_t z; @@ -868,6 +891,9 @@ int main(int argc, char **argv) { passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size); func_count++; + passed[func_count][size_count] = test_srslte_vec_neg_sss(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size); func_count++;