PHY: Manually unroll loops that use _mm_extract_epi(8/16)

clang does not unroll those loops even though it supports the
-funroll-loops command line option; adding various #pragma unroll
directives does not help either.

The loops must be unrolled so that the second argument of
_mm_extract_epi8/_mm_extract_epi16 is a constant integer.

Enable the SSE/AVX turbo rate matching when compiling in debug mode.

srsLTE/lib/src/phy/fec/rm_turbo.c:590:33: error: argument to '__builtin_ia32_vec_ext_v16qi' must be a constant integer
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j+8);
                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib64/clang/7.0.1/include/smmintrin.h:1048:23: note: expanded from macro '_mm_extract_epi8'
  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
                      ^
srsLTE/lib/src/phy/fec/rm_turbo.c:591:35: error: argument to '__builtin_ia32_vec_ext_v8hi' must be a constant integer
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib64/clang/7.0.1/include/emmintrin.h:4273:24: note: expanded from macro '_mm_extract_epi16'
  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
This commit is contained in:
Vasil Velichkov 2019-10-17 02:22:37 +03:00 committed by Andre Puschmann
parent af2b4ecc79
commit a44671fc77
2 changed files with 158 additions and 76 deletions

View File

@ -32,12 +32,6 @@
#include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h"
#ifdef DEBUG_MODE
#pragma message "FIXME: Disabling SSE/AVX turbo rate matching"
#undef LV_HAVE_SSE
#undef LV_HAVE_AVX
#endif
#ifdef LV_HAVE_SSE
#include <x86intrin.h>
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
@ -470,6 +464,11 @@ int srslte_rm_turbo_rx_lut_8bit(int8_t *input, int8_t *output, uint32_t in_len,
#ifdef LV_HAVE_SSE
/* Scatter-add one 16-bit lane: extract lane j of xVal (the sample) and lane j
 * of lutVal (the destination index), then add the sample to output[index].
 *
 * j must be an integer constant, because _mm_extract_epi16 only accepts a
 * constant lane index; this is why the lanes are written out one by one
 * instead of being iterated in a loop.
 *
 * The expansion site must provide xVal, lutVal, output and the scratch
 * variables x and l. The body is wrapped in do { } while (0) so that the
 * macro behaves as a single statement (e.g. under an unbraced if/else). */
#define SAVE_OUTPUT_16_SSE(j)                                                  \
  do {                                                                         \
    x = (int16_t)_mm_extract_epi16(xVal, j);                                   \
    l = (uint16_t)_mm_extract_epi16(lutVal, j);                                \
    output[l] += x;                                                            \
  } while (0)
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
@ -478,18 +477,25 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
const __m128i* xPtr = (const __m128i*) input;
const __m128i* lutPtr = (const __m128i*) deinter;
__m128i xVal, lutVal;
int16_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) {
for (int i=0;i<in_len/8;i++) {
xVal = _mm_loadu_si128(xPtr);
lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
output[l] += x;
}
SAVE_OUTPUT_16_SSE(0);
SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
xPtr ++;
lutPtr ++;
}
@ -503,12 +509,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
while(inputCnt < in_len - 8) {
xVal = _mm_loadu_si128(xPtr);
lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
output[l] += x;
}
SAVE_OUTPUT_16_SSE(0);
SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
xPtr++;
lutPtr++;
intCnt += 8;
@ -539,6 +549,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
}
}
/* Scatter-add one 8-bit sample from the lower half of xVal: extract byte j of
 * xVal and 16-bit lane j of lutVal1 (the destination index), then add the
 * sample to output[index].
 *
 * j must be an integer constant, because the extract intrinsics only accept a
 * constant lane index.
 *
 * The expansion site must provide xVal, lutVal1, output and the scratch
 * variables x and l. The body is wrapped in do { } while (0) so that the
 * macro behaves as a single statement. */
#define SAVE_OUTPUT_SSE_8(j)                                                   \
  do {                                                                         \
    x = (int8_t)_mm_extract_epi8(xVal, j);                                     \
    l = (uint16_t)_mm_extract_epi16(lutVal1, j);                               \
    output[l] += x;                                                            \
  } while (0)
/* Scatter-add one 8-bit sample from the upper half of xVal: extract byte
 * (j + 8) of xVal and 16-bit lane j of lutVal2 (the destination index), then
 * add the sample to output[index].
 *
 * j must be an integer constant, because the extract intrinsics only accept a
 * constant lane index. The argument is parenthesized before adding 8 so that
 * an expression argument cannot re-associate with the addition.
 *
 * The expansion site must provide xVal, lutVal2, output and the scratch
 * variables x and l. The body is wrapped in do { } while (0) so that the
 * macro behaves as a single statement. */
#define SAVE_OUTPUT_SSE_8_2(j)                                                 \
  do {                                                                         \
    x = (int8_t)_mm_extract_epi8(xVal, (j) + 8);                               \
    l = (uint16_t)_mm_extract_epi16(lutVal2, j);                               \
    output[l] += x;                                                            \
  } while (0)
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
@ -548,6 +568,9 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
const __m128i* lutPtr = (const __m128i*) deinter;
__m128i xVal, lutVal1, lutVal2;
int8_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) {
for (int i=0;i<in_len/16;i++) {
@ -558,16 +581,23 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++;
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
output[l] += x;
}
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
output[l] += x;
}
SAVE_OUTPUT_SSE_8(0);
SAVE_OUTPUT_SSE_8(1);
SAVE_OUTPUT_SSE_8(2);
SAVE_OUTPUT_SSE_8(3);
SAVE_OUTPUT_SSE_8(4);
SAVE_OUTPUT_SSE_8(5);
SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
}
for (int i=16*(in_len/16);i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
@ -584,16 +614,24 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++;
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
output[l] += x;
}
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
output[l] += x;
}
SAVE_OUTPUT_SSE_8(0);
SAVE_OUTPUT_SSE_8(1);
SAVE_OUTPUT_SSE_8(2);
SAVE_OUTPUT_SSE_8(3);
SAVE_OUTPUT_SSE_8(4);
SAVE_OUTPUT_SSE_8(5);
SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
intCnt += 16;
inputCnt += 16;
if (intCnt >= out_len && inputCnt < in_len - 16) {
@ -635,9 +673,10 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
#ifdef LV_HAVE_AVX
/* SAVE_OUTPUT(j): extracts 16-bit lane j of xVal and of lutVal with
 * _mm256_extract_epi16, then adds the extracted sample to output[] at the
 * extracted index. It assigns the scratch variables x and l, so the expansion
 * site must declare them along with xVal, lutVal and output.
 * The expansion is several bare statements, not a single statement.
 * NOTE(review): the macro is defined twice in a row below with the same
 * replacement tokens (only the layout differs) -- presumably the previous and
 * the reformatted version of the same definition; confirm that only one of
 * them is present in the actual source file. */
#define SAVE_OUTPUT(j) x = (int16_t) _mm256_extract_epi16(xVal, j);\
l = (uint16_t) _mm256_extract_epi16(lutVal, j);\
output[l] += x;
#define SAVE_OUTPUT(j) \
x = (int16_t)_mm256_extract_epi16(xVal, j); \
l = (uint16_t)_mm256_extract_epi16(lutVal, j); \
output[l] += x;
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{

View File

@ -276,35 +276,50 @@ void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const
}
}
/* Scatter one 16-bit lane: extract lane j of xVal (the value) and lane j of
 * lutVal (the destination index), then store the value at y[index].
 *
 * j must be an integer constant, because _mm_extract_epi16 only accepts a
 * constant lane index; this is why the lanes are written out one by one
 * instead of being iterated in a loop.
 *
 * The expansion site must provide xVal, lutVal, y and the scratch variables
 * x (int16_t) and l (uint16_t). The body is wrapped in do { } while (0) so
 * that the macro behaves as a single statement. */
#define SAVE_OUTPUT_16_SSE(j)                                                  \
  do {                                                                         \
    x = (int16_t)_mm_extract_epi16(xVal, j);                                   \
    l = (uint16_t)_mm_extract_epi16(lutVal, j);                                \
    y[l] = (short)x;                                                           \
  } while (0)
/* No improvement with AVX */
void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) {
int i = 0;
#ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 7; i += 8) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x;
}
int16_t x;
uint16_t l;
SAVE_OUTPUT_16_SSE(0);
SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
}
} else {
for (; i < len - 7; i += 8) {
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x;
}
int16_t x;
uint16_t l;
SAVE_OUTPUT_16_SSE(0);
SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
}
}
#endif
#endif
for (; i < len; i++) {
@ -312,26 +327,45 @@ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y
}
}
/* Scatter one 8-bit value from the lower half of xVal: extract byte j of xVal
 * and 16-bit lane j of lutVal1 (the destination index), then store the value
 * at y[index].
 *
 * j must be an integer constant, because the extract intrinsics only accept a
 * constant lane index.
 *
 * The expansion site must provide xVal, lutVal1, y and the scratch variables
 * x (int8_t) and l (uint16_t). The body is wrapped in do { } while (0) so
 * that the macro behaves as a single statement. */
#define SAVE_OUTPUT_SSE_8(j)                                                   \
  do {                                                                         \
    x = (int8_t)_mm_extract_epi8(xVal, j);                                     \
    l = (uint16_t)_mm_extract_epi16(lutVal1, j);                               \
    y[l] = (char)x;                                                            \
  } while (0)
/* Scatter one 8-bit value from the upper half of xVal: extract byte (j + 8)
 * of xVal and 16-bit lane j of lutVal2 (the destination index), then store
 * the value at y[index].
 *
 * j must be an integer constant, because the extract intrinsics only accept a
 * constant lane index. The argument is parenthesized before adding 8 so that
 * an expression argument cannot re-associate with the addition.
 *
 * The expansion site must provide xVal, lutVal2, y and the scratch variables
 * x (int8_t) and l (uint16_t). The body is wrapped in do { } while (0) so
 * that the macro behaves as a single statement. */
#define SAVE_OUTPUT_SSE_8_2(j)                                                 \
  do {                                                                         \
    x = (int8_t)_mm_extract_epi8(xVal, (j) + 8);                               \
    l = (uint16_t)_mm_extract_epi16(lutVal2, j);                               \
    y[l] = (char)x;                                                            \
  } while (0)
void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) {
int i = 0;
#ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 15; i += 16) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x;
}
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
y[l] = (char) x;
}
int8_t x;
uint16_t l;
SAVE_OUTPUT_SSE_8(0);
SAVE_OUTPUT_SSE_8(1);
SAVE_OUTPUT_SSE_8(2);
SAVE_OUTPUT_SSE_8(3);
SAVE_OUTPUT_SSE_8(4);
SAVE_OUTPUT_SSE_8(5);
SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
}
} else {
for (; i < len - 15; i += 16) {
@ -339,19 +373,28 @@ void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t
__m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x;
}
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
y[l] = (char) x;
}
int8_t x;
uint16_t l;
SAVE_OUTPUT_SSE_8(0);
SAVE_OUTPUT_SSE_8(1);
SAVE_OUTPUT_SSE_8(2);
SAVE_OUTPUT_SSE_8(3);
SAVE_OUTPUT_SSE_8(4);
SAVE_OUTPUT_SSE_8(5);
SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
}
}
#endif
#endif
for (; i < len; i++) {