mirror of https://github.com/PentHertz/srsLTE.git
PHY: Manually unroll loops that use _mm_extract_epi(8/16)
clang does not unroll those loops even though it supports the -funroll-loops command line option, adding various #pragma unroll options also does not help. The unroll is needed to make the second argument a constant integer. Enable the SSE/AVX turbo rate matching when compiling in debug mode. srsLTE/lib/src/phy/fec/rm_turbo.c:590:33: error: argument to '__builtin_ia32_vec_ext_v16qi' must be a constant integer int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/clang/7.0.1/include/smmintrin.h:1048:23: note: expanded from macro '_mm_extract_epi8' (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ ^ srsLTE/lib/src/phy/fec/rm_turbo.c:591:35: error: argument to '__builtin_ia32_vec_ext_v8hi' must be a constant integer uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/clang/7.0.1/include/emmintrin.h:4273:24: note: expanded from macro '_mm_extract_epi16' (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
This commit is contained in:
parent
af2b4ecc79
commit
a44671fc77
|
@ -32,12 +32,6 @@
|
|||
#include "srslte/phy/utils/debug.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
#ifdef DEBUG_MODE
|
||||
#pragma message "FIXME: Disabling SSE/AVX turbo rate matching"
|
||||
#undef LV_HAVE_SSE
|
||||
#undef LV_HAVE_AVX
|
||||
#endif
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <x86intrin.h>
|
||||
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
|
||||
|
@ -470,6 +464,11 @@ int srslte_rm_turbo_rx_lut_8bit(int8_t *input, int8_t *output, uint32_t in_len,
|
|||
|
||||
#ifdef LV_HAVE_SSE
|
||||
|
||||
#define SAVE_OUTPUT_16_SSE(j) \
|
||||
x = (int16_t)_mm_extract_epi16(xVal, j); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal, j); \
|
||||
output[l] += x;
|
||||
|
||||
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
|
||||
{
|
||||
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
|
||||
|
@ -478,18 +477,25 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
|
|||
const __m128i* xPtr = (const __m128i*) input;
|
||||
const __m128i* lutPtr = (const __m128i*) deinter;
|
||||
__m128i xVal, lutVal;
|
||||
|
||||
|
||||
int16_t x;
|
||||
uint16_t l;
|
||||
|
||||
/* Simplify load if we do not need to wrap (ie high rates) */
|
||||
if (in_len <= out_len) {
|
||||
for (int i=0;i<in_len/8;i++) {
|
||||
xVal = _mm_loadu_si128(xPtr);
|
||||
lutVal = _mm_loadu_si128(lutPtr);
|
||||
|
||||
for (int j=0;j<8;j++) {
|
||||
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
|
||||
output[l] += x;
|
||||
}
|
||||
|
||||
SAVE_OUTPUT_16_SSE(0);
|
||||
SAVE_OUTPUT_16_SSE(1);
|
||||
SAVE_OUTPUT_16_SSE(2);
|
||||
SAVE_OUTPUT_16_SSE(3);
|
||||
SAVE_OUTPUT_16_SSE(4);
|
||||
SAVE_OUTPUT_16_SSE(5);
|
||||
SAVE_OUTPUT_16_SSE(6);
|
||||
SAVE_OUTPUT_16_SSE(7);
|
||||
|
||||
xPtr ++;
|
||||
lutPtr ++;
|
||||
}
|
||||
|
@ -503,12 +509,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
|
|||
while(inputCnt < in_len - 8) {
|
||||
xVal = _mm_loadu_si128(xPtr);
|
||||
lutVal = _mm_loadu_si128(lutPtr);
|
||||
|
||||
for (int j=0;j<8;j++) {
|
||||
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
|
||||
output[l] += x;
|
||||
}
|
||||
|
||||
SAVE_OUTPUT_16_SSE(0);
|
||||
SAVE_OUTPUT_16_SSE(1);
|
||||
SAVE_OUTPUT_16_SSE(2);
|
||||
SAVE_OUTPUT_16_SSE(3);
|
||||
SAVE_OUTPUT_16_SSE(4);
|
||||
SAVE_OUTPUT_16_SSE(5);
|
||||
SAVE_OUTPUT_16_SSE(6);
|
||||
SAVE_OUTPUT_16_SSE(7);
|
||||
|
||||
xPtr++;
|
||||
lutPtr++;
|
||||
intCnt += 8;
|
||||
|
@ -539,6 +549,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
|
|||
}
|
||||
}
|
||||
|
||||
#define SAVE_OUTPUT_SSE_8(j) \
|
||||
x = (int8_t)_mm_extract_epi8(xVal, j); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal1, j); \
|
||||
output[l] += x;
|
||||
|
||||
#define SAVE_OUTPUT_SSE_8_2(j) \
|
||||
x = (int8_t)_mm_extract_epi8(xVal, j + 8); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal2, j); \
|
||||
output[l] += x;
|
||||
|
||||
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
|
||||
{
|
||||
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
|
||||
|
@ -548,6 +568,9 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
|
|||
const __m128i* lutPtr = (const __m128i*) deinter;
|
||||
__m128i xVal, lutVal1, lutVal2;
|
||||
|
||||
int8_t x;
|
||||
uint16_t l;
|
||||
|
||||
/* Simplify load if we do not need to wrap (ie high rates) */
|
||||
if (in_len <= out_len) {
|
||||
for (int i=0;i<in_len/16;i++) {
|
||||
|
@ -558,16 +581,23 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
|
|||
lutVal2 = _mm_loadu_si128(lutPtr);
|
||||
lutPtr ++;
|
||||
|
||||
for (int j=0;j<8;j++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
|
||||
output[l] += x;
|
||||
}
|
||||
for (int j=0;j<8;j++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
|
||||
output[l] += x;
|
||||
}
|
||||
SAVE_OUTPUT_SSE_8(0);
|
||||
SAVE_OUTPUT_SSE_8(1);
|
||||
SAVE_OUTPUT_SSE_8(2);
|
||||
SAVE_OUTPUT_SSE_8(3);
|
||||
SAVE_OUTPUT_SSE_8(4);
|
||||
SAVE_OUTPUT_SSE_8(5);
|
||||
SAVE_OUTPUT_SSE_8(6);
|
||||
SAVE_OUTPUT_SSE_8(7);
|
||||
|
||||
SAVE_OUTPUT_SSE_8_2(0);
|
||||
SAVE_OUTPUT_SSE_8_2(1);
|
||||
SAVE_OUTPUT_SSE_8_2(2);
|
||||
SAVE_OUTPUT_SSE_8_2(3);
|
||||
SAVE_OUTPUT_SSE_8_2(4);
|
||||
SAVE_OUTPUT_SSE_8_2(5);
|
||||
SAVE_OUTPUT_SSE_8_2(6);
|
||||
SAVE_OUTPUT_SSE_8_2(7);
|
||||
}
|
||||
for (int i=16*(in_len/16);i<in_len;i++) {
|
||||
output[deinter[i%out_len]] += input[i];
|
||||
|
@ -584,16 +614,24 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
|
|||
lutVal2 = _mm_loadu_si128(lutPtr);
|
||||
lutPtr ++;
|
||||
|
||||
for (int j=0;j<8;j++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
|
||||
output[l] += x;
|
||||
}
|
||||
for (int j=0;j<8;j++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
|
||||
output[l] += x;
|
||||
}
|
||||
SAVE_OUTPUT_SSE_8(0);
|
||||
SAVE_OUTPUT_SSE_8(1);
|
||||
SAVE_OUTPUT_SSE_8(2);
|
||||
SAVE_OUTPUT_SSE_8(3);
|
||||
SAVE_OUTPUT_SSE_8(4);
|
||||
SAVE_OUTPUT_SSE_8(5);
|
||||
SAVE_OUTPUT_SSE_8(6);
|
||||
SAVE_OUTPUT_SSE_8(7);
|
||||
|
||||
SAVE_OUTPUT_SSE_8_2(0);
|
||||
SAVE_OUTPUT_SSE_8_2(1);
|
||||
SAVE_OUTPUT_SSE_8_2(2);
|
||||
SAVE_OUTPUT_SSE_8_2(3);
|
||||
SAVE_OUTPUT_SSE_8_2(4);
|
||||
SAVE_OUTPUT_SSE_8_2(5);
|
||||
SAVE_OUTPUT_SSE_8_2(6);
|
||||
SAVE_OUTPUT_SSE_8_2(7);
|
||||
|
||||
intCnt += 16;
|
||||
inputCnt += 16;
|
||||
if (intCnt >= out_len && inputCnt < in_len - 16) {
|
||||
|
@ -635,9 +673,10 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
|
|||
|
||||
#ifdef LV_HAVE_AVX
|
||||
|
||||
#define SAVE_OUTPUT(j) x = (int16_t) _mm256_extract_epi16(xVal, j);\
|
||||
l = (uint16_t) _mm256_extract_epi16(lutVal, j);\
|
||||
output[l] += x;
|
||||
#define SAVE_OUTPUT(j) \
|
||||
x = (int16_t)_mm256_extract_epi16(xVal, j); \
|
||||
l = (uint16_t)_mm256_extract_epi16(lutVal, j); \
|
||||
output[l] += x;
|
||||
|
||||
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
|
||||
{
|
||||
|
|
|
@ -276,35 +276,50 @@ void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const
|
|||
}
|
||||
}
|
||||
|
||||
#define SAVE_OUTPUT_16_SSE(j) \
|
||||
x = (int16_t)_mm_extract_epi16(xVal, j); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal, j); \
|
||||
y[l] = (short)x;
|
||||
|
||||
/* No improvement with AVX */
|
||||
void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) {
|
||||
int i = 0;
|
||||
#ifdef LV_HAVE_SSE
|
||||
#ifndef DEBUG_MODE
|
||||
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
|
||||
for (; i < len - 7; i += 8) {
|
||||
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
|
||||
__m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
|
||||
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
|
||||
y[l] = (short) x;
|
||||
}
|
||||
int16_t x;
|
||||
uint16_t l;
|
||||
|
||||
SAVE_OUTPUT_16_SSE(0);
|
||||
SAVE_OUTPUT_16_SSE(1);
|
||||
SAVE_OUTPUT_16_SSE(2);
|
||||
SAVE_OUTPUT_16_SSE(3);
|
||||
SAVE_OUTPUT_16_SSE(4);
|
||||
SAVE_OUTPUT_16_SSE(5);
|
||||
SAVE_OUTPUT_16_SSE(6);
|
||||
SAVE_OUTPUT_16_SSE(7);
|
||||
}
|
||||
} else {
|
||||
for (; i < len - 7; i += 8) {
|
||||
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
|
||||
__m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
|
||||
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
|
||||
y[l] = (short) x;
|
||||
}
|
||||
int16_t x;
|
||||
uint16_t l;
|
||||
|
||||
SAVE_OUTPUT_16_SSE(0);
|
||||
SAVE_OUTPUT_16_SSE(1);
|
||||
SAVE_OUTPUT_16_SSE(2);
|
||||
SAVE_OUTPUT_16_SSE(3);
|
||||
SAVE_OUTPUT_16_SSE(4);
|
||||
SAVE_OUTPUT_16_SSE(5);
|
||||
SAVE_OUTPUT_16_SSE(6);
|
||||
SAVE_OUTPUT_16_SSE(7);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (; i < len; i++) {
|
||||
|
@ -312,26 +327,45 @@ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y
|
|||
}
|
||||
}
|
||||
|
||||
#define SAVE_OUTPUT_SSE_8(j) \
|
||||
x = (int8_t)_mm_extract_epi8(xVal, j); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal1, j); \
|
||||
y[l] = (char)x;
|
||||
|
||||
#define SAVE_OUTPUT_SSE_8_2(j) \
|
||||
x = (int8_t)_mm_extract_epi8(xVal, j + 8); \
|
||||
l = (uint16_t)_mm_extract_epi16(lutVal2, j); \
|
||||
y[l] = (char)x;
|
||||
|
||||
void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) {
|
||||
int i = 0;
|
||||
#ifdef LV_HAVE_SSE
|
||||
#ifndef DEBUG_MODE
|
||||
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
|
||||
for (; i < len - 15; i += 16) {
|
||||
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
|
||||
__m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]);
|
||||
__m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]);
|
||||
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
|
||||
y[l] = (char) x;
|
||||
}
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
|
||||
y[l] = (char) x;
|
||||
}
|
||||
int8_t x;
|
||||
uint16_t l;
|
||||
|
||||
SAVE_OUTPUT_SSE_8(0);
|
||||
SAVE_OUTPUT_SSE_8(1);
|
||||
SAVE_OUTPUT_SSE_8(2);
|
||||
SAVE_OUTPUT_SSE_8(3);
|
||||
SAVE_OUTPUT_SSE_8(4);
|
||||
SAVE_OUTPUT_SSE_8(5);
|
||||
SAVE_OUTPUT_SSE_8(6);
|
||||
SAVE_OUTPUT_SSE_8(7);
|
||||
|
||||
SAVE_OUTPUT_SSE_8_2(0);
|
||||
SAVE_OUTPUT_SSE_8_2(1);
|
||||
SAVE_OUTPUT_SSE_8_2(2);
|
||||
SAVE_OUTPUT_SSE_8_2(3);
|
||||
SAVE_OUTPUT_SSE_8_2(4);
|
||||
SAVE_OUTPUT_SSE_8_2(5);
|
||||
SAVE_OUTPUT_SSE_8_2(6);
|
||||
SAVE_OUTPUT_SSE_8_2(7);
|
||||
}
|
||||
} else {
|
||||
for (; i < len - 15; i += 16) {
|
||||
|
@ -339,19 +373,28 @@ void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t
|
|||
__m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]);
|
||||
__m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]);
|
||||
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
|
||||
y[l] = (char) x;
|
||||
}
|
||||
for (int k = 0; k < 8; k++) {
|
||||
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
|
||||
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
|
||||
y[l] = (char) x;
|
||||
}
|
||||
int8_t x;
|
||||
uint16_t l;
|
||||
|
||||
SAVE_OUTPUT_SSE_8(0);
|
||||
SAVE_OUTPUT_SSE_8(1);
|
||||
SAVE_OUTPUT_SSE_8(2);
|
||||
SAVE_OUTPUT_SSE_8(3);
|
||||
SAVE_OUTPUT_SSE_8(4);
|
||||
SAVE_OUTPUT_SSE_8(5);
|
||||
SAVE_OUTPUT_SSE_8(6);
|
||||
SAVE_OUTPUT_SSE_8(7);
|
||||
|
||||
SAVE_OUTPUT_SSE_8_2(0);
|
||||
SAVE_OUTPUT_SSE_8_2(1);
|
||||
SAVE_OUTPUT_SSE_8_2(2);
|
||||
SAVE_OUTPUT_SSE_8_2(3);
|
||||
SAVE_OUTPUT_SSE_8_2(4);
|
||||
SAVE_OUTPUT_SSE_8_2(5);
|
||||
SAVE_OUTPUT_SSE_8_2(6);
|
||||
SAVE_OUTPUT_SSE_8_2(7);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
for (; i < len; i++) {
|
||||
|
|
Loading…
Reference in New Issue