From 3762738bc11e69ea5e888918b2fa647ea366b89a Mon Sep 17 00:00:00 2001 From: yagoda Date: Wed, 12 Dec 2018 11:27:07 +0100 Subject: [PATCH] adding windowed neon turbodecoder (can be further optimized) --- .../srslte/phy/fec/turbodecoder_impl.h | 1 + lib/include/srslte/phy/fec/turbodecoder_win.h | 126 +++++++++++++++++- lib/src/phy/fec/test/turbodecoder_test.c | 7 +- lib/src/phy/fec/turbodecoder.c | 38 +++++- srsue/src/phy/phch_recv.cc | 3 - 5 files changed, 163 insertions(+), 12 deletions(-) diff --git a/lib/include/srslte/phy/fec/turbodecoder_impl.h b/lib/include/srslte/phy/fec/turbodecoder_impl.h index 1fe0a5321..fa39626ee 100644 --- a/lib/include/srslte/phy/fec/turbodecoder_impl.h +++ b/lib/include/srslte/phy/fec/turbodecoder_impl.h @@ -35,6 +35,7 @@ typedef enum SRSLTE_API { SRSLTE_TDEC_GENERIC, SRSLTE_TDEC_SSE, SRSLTE_TDEC_SSE_WINDOW, + SRSLTE_TDEC_NEON_WINDOW, SRSLTE_TDEC_AVX_WINDOW, SRSLTE_TDEC_SSE8_WINDOW, SRSLTE_TDEC_AVX8_WINDOW, diff --git a/lib/include/srslte/phy/fec/turbodecoder_win.h b/lib/include/srslte/phy/fec/turbodecoder_win.h index bd52e6284..ffa4bf4c9 100644 --- a/lib/include/srslte/phy/fec/turbodecoder_win.h +++ b/lib/include/srslte/phy/fec/turbodecoder_win.h @@ -178,9 +178,102 @@ } +#else +#if HAVE_NEON + #include + + #define WINIMP arm16 + #define nof_blocks 8 + + #define llr_t int16_t + + + + #define v_insert_s16(a, b, imm) \ + ({ \ + (vsetq_lane_s16((b), (a), (imm))); \ + }) + + #define int8x16_to_8x8x2(v) ((int8x8x2_t) {{ vget_low_s8(v), vget_high_s8(v) }})// TODO + +static inline int movemask_neon(uint8x16_t movemask_low_in) { + + uint8x8_t mask_and = vdup_n_u8(0x80); + int8_t __attribute__((aligned(16))) xr[8]; + for(int i = 0; i <8;i++) + xr[i] = i-7; + + int8x8_t mask_shift = vld1_s8(xr); + uint8x8_t lo = vget_low_u8(movemask_low_in); + uint8x8_t hi = vget_high_u8(movemask_low_in); + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +} + inline static int16x8_t vshuff_s8(int16x8_t in, uint8x16_t mask) +{ + int8x8x2_t x = int8x16_to_8x8x2((int8x16_t)in); + int8x8_t u = (int8x8_t)vget_low_u8(mask); + int8x8_t eq = vtbl2_s8(x,u); + + int8x8x2_t x2 = int8x16_to_8x8x2((int8x16_t)in); + int8x8_t u2 = (int8x8_t)vget_high_u8(mask); + int8x8_t eq2 = vtbl2_s8(x2,u2); + return (int16x8_t)vcombine_s8(eq,eq2); +} + static inline int16x8_t v_packs_s16(int16x8_t a, int16x8_t b) +{ + return (int16x8_t)(vcombine_s8(vqmovn_s16((a)), vqmovn_s16((b)))); +} + + +inline static int16x8_t v_srai_s16(const int16x8_t a, const int count) { + int16x8_t b = vmovq_n_s16(-count); + return vshlq_s16(a,b); +} +inline static uint8x16_t v_load_s8(int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) +{ + uint8_t __attribute__((aligned(16))) data[16] = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; + return vld1q_u8(data); +} + + + #define simd_type_t int16x8_t + #define simd_load(x) vld1q_s16((int16_t*)x) + #define simd_store(x,y) vst1q_s16((int16_t*)x,y) + #define simd_add vaddq_s16 + #define simd_sub vsubq_s16 + #define simd_max vmaxq_s16 + #define simd_set1 vdupq_n_s16 + #define simd_insert v_insert_s16 + #define simd_shuffle vshuff_s8 + #define move_right v_load_s8(15,14,15,14,13,12,11,10,9,8,7,6,5,4,3,2) + #define move_left v_load_s8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0) + #define simd_rb_shift v_srai_s16 + + #define normalize_period 2 + #define win_overlap_len 40 + +#define divide_output 1 + +#define INF 10000 + #else #error "Unknown WINIMP value" #endif + +#endif #endif #endif #endif @@ -681,10 +774,20 @@ void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_ k -= (long_cb-1);\ }\ } + + + +#ifdef HAVE_NEON +#define insert_bit(a,b) ap = v_insert_s16(ap, app1[k+(a%b)*nof_blocks], 7-a); \ + reset_cnt(a,b); +#else #define insert_bit(a,b) ap = _mm_insert_epi16(ap, app1[k+(a%b)*nof_blocks], 7-a); \ - reset_cnt(a,b); \ + reset_cnt(a,b); +#endif + +#ifndef HAVE_NEON #define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \ insert_bit(0,b);\ insert_bit(1,b);\ @@ -696,14 +799,31 @@ void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_ insert_bit(7,b);\ output[i] = (uint8_t) _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_packs_epi16(ap,zeros),zeros));\ } - +#else +#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \ + insert_bit(0,b);\ + insert_bit(1,b);\ + insert_bit(2,b);\ + insert_bit(3,b);\ + insert_bit(4,b);\ + insert_bit(5,b);\ + insert_bit(6,b);\ + insert_bit(7,b);\ + output[i] = (uint8_t) movemask_neon((uint8x16_t)vcgtq_s8((int8x16_t)v_packs_s16(ap,(int16x8_t)zeros),zeros));\ + } +#endif /* No improvement to use AVX here */ void MAKE_FUNC(decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb) { uint32_t k=0; +#ifdef HAVE_NEON + int8_t z = 0; + int8x16_t zeros = vld1q_dup_s8(&z); + int16x8_t ap; +#else __m128i zeros = _mm_setzero_si128(); __m128i ap; - +#endif if ((long_cb%(nof_blocks*8)) == 0) { decide_for(8); } else if ((long_cb%(nof_blocks*4)) == 0) { diff --git a/lib/src/phy/fec/test/turbodecoder_test.c b/lib/src/phy/fec/test/turbodecoder_test.c index 58fe3ddb5..e80512518 100644 --- a/lib/src/phy/fec/test/turbodecoder_test.c +++ b/lib/src/phy/fec/test/turbodecoder_test.c @@ -197,11 +197,16 @@ int main(int argc, char **argv) { exit(-1); } +#ifdef HAVE_NEON + tdec_type = SRSLTE_TDEC_NEON_WINDOW; +#else + // tdec_type = SRSLTE_TDEC_SSE_WINDOW; +#endif if (srslte_tdec_init_manual(&tdec, frame_length, tdec_type)) { fprintf(stderr, "Error initiating Turbo decoder\n"); exit(-1); } - + srslte_tdec_force_not_sb(&tdec); float ebno_inc, esno_db; diff --git a/lib/src/phy/fec/turbodecoder.c b/lib/src/phy/fec/turbodecoder.c index 0ae094ecf..8fa41c466 100644 --- a/lib/src/phy/fec/turbodecoder.c +++ b/lib/src/phy/fec/turbodecoder.c @@ -100,6 +100,22 @@ srslte_tdec_8bit_impl_t sse8_win_impl = { }; #endif + +#ifdef HAVE_NEON +//#include "srslte/phy/fec/turbodecoder_neon.h" +#define WINIMP_IS_NEON16 +#include "srslte/phy/fec/turbodecoder_win.h" +#undef WINIMP_IS_NEON16 + +srslte_tdec_16bit_impl_t arm16_win_impl = { + tdec_winarm16_init, + tdec_winarm16_free, + tdec_winarm16_dec, + tdec_winarm16_extract_input, + tdec_winarm16_decision_byte +}; +#endif + /* AVX window implementation */ #ifdef LV_HAVE_AVX2 #define WINIMP_IS_AVX8 @@ -119,6 +135,8 @@ srslte_tdec_8bit_impl_t avx8_win_impl = { #define AUTO_16_AVXWIN 2 #define AUTO_8_SSEWIN 0 #define AUTO_8_AVXWIN 1 +#define AUTO_16_GEN 0 +#define AUTO_16_NEONWIN 1 // Include interfaces for 8 and 16 bit decoder implementations @@ -177,10 +195,17 @@ int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec h->current_llr_type = SRSLTE_TDEC_8; break; #endif +#ifdef HAVE_NEON + case SRSLTE_TDEC_NEON_WINDOW: + h->dec16[0] = &arm16_win_impl; + h->current_llr_type = SRSLTE_TDEC_16; + break; +#else case SRSLTE_TDEC_GENERIC: h->dec16[0] = &gen_impl; h->current_llr_type = SRSLTE_TDEC_16; break; +#endif #ifdef LV_HAVE_AVX2 case SRSLTE_TDEC_AVX_WINDOW: h->dec16[0] = &avx16_win_impl; @@ -241,18 +266,21 @@ int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec if (dec_type == SRSLTE_TDEC_AUTO) { #ifdef HAVE_NEON - h->dec16[0] = &gen_impl; - h->current_llr_type = SRSLTE_TDEC_16; - //h->dec8[0] = &gen_impl; -#else - h->dec16[AUTO_16_SSE] = &sse_impl; + h->dec16[AUTO_16_GEN] = &gen_impl; + h->dec16[AUTO_16_NEONWIN] = &arm16_win_impl; +#elif LV_HAVE_SSE + h->dec16[AUTO_16_SSE] = &gen_impl; h->dec16[AUTO_16_SSEWIN] = &sse16_win_impl; h->dec8[AUTO_8_SSEWIN] = &sse8_win_impl; #ifdef LV_HAVE_AVX2 h->dec16[AUTO_16_AVXWIN] = &avx16_win_impl; h->dec8[AUTO_8_AVXWIN] = &avx8_win_impl; #endif +#else + h->dec16[AUTO_16_SSE] = &gen_impl; + h->dec16[AUTO_16_SSEWIN] = &gen_impl; #endif /* HAVE_NEON */ + for (int td=0;tddec16[td]) { if ((h->nof_blocks16[td] = h->dec16[td]->tdec_init(&h->dec16_hdlr[td], h->max_long_cb))<0) { diff --git a/srsue/src/phy/phch_recv.cc b/srsue/src/phy/phch_recv.cc index 1dde96f84..3cac3509c 100644 --- a/srsue/src/phy/phch_recv.cc +++ b/srsue/src/phy/phch_recv.cc @@ -1499,9 +1499,6 @@ int phch_recv::scell_recv::find_cells(cf_t *input_buffer, float rx_gain_offset, */ void phch_recv::meas_reset() { - if (enable_raa_searcher) { - raa_searcher->stop(-1); - } // Stop all measurements intra_freq_meas.clear_cells(); }