/**
 *
 * \section COPYRIGHT
 *
 * Copyright 2013-2015 Software Radio Systems Limited
 *
 * \section LICENSE
 *
 * This file is part of the srsLTE library.
 *
 * srsLTE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * srsLTE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * A copy of the GNU Affero General Public License can be found in
 * the LICENSE file in the top-level directory of this distribution
 * and at http://www.gnu.org/licenses/.
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>

#include "srslte/fec/turbodecoder_sse.h"
#include "srslte/utils/vector.h"

#include <inttypes.h>

#ifdef LV_HAVE_SSE
#include <smmintrin.h>
#endif

#define NUMSTATES   8
#define NINPUTS     2
#define TAIL        3
#define TOTALTAIL   12

#define INF 10000
#define ZERO 0

#ifdef LV_HAVE_SSE

/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction.
 * Byte-wise subtraction from 0x7FFF is exact here: the low byte (0xFF) can never borrow,
 * so each 16-bit lane holds 0x7FFF - x and the unsigned minimum is 0x7FFF - max(x).
 * The constant offset cancels when differences of hMax() results are taken, as in
 * map_gen_beta(). */
static inline int16_t hMax(__m128i buffer)
{
  __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
  __m128i tmp3 = _mm_minpos_epu16(tmp1);
  return (int16_t)(_mm_cvtsi128_si32(tmp3));
}

/* Computes beta values */
void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
{
  int k;
  uint32_t end = long_cb + 3;
  const __m128i *alphaPtr = (const __m128i*) s->alpha;

  __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
  __m128i g, bp, bn, alpha_k;

  /* Define the shuffle constant for the positive beta */
  __m128i shuf_bp = _mm_set_epi8(
    15, 14, // 7
    7,  6,  // 3
    5,  4,  // 2
    13, 12, // 6
    11, 10, // 5
    3,  2,  // 1
    1,  0,  // 0
    9,  8   // 4
  );

  /* Define the shuffle constant for the negative beta */
  __m128i shuf_bn = _mm_set_epi8(
    7,  6,  // 3
    15, 14, // 7
    13, 12, // 6
    5,  4,  // 2
    3,  2,  // 1
    11, 10, // 5
    9,  8,  // 4
    1,  0   // 0
  );
  alphaPtr += long_cb - 1;

  /* Define shuffle for branch costs */
  __m128i shuf_g[4];
  shuf_g[3] = _mm_set_epi8(3, 2, 1, 0, 1, 0, 3, 2, 3, 2, 1, 0, 1, 0, 3, 2);
  shuf_g[2] = _mm_set_epi8(7, 6, 5, 4, 5, 4, 7, 6, 7, 6, 5, 4, 5, 4, 7, 6);
  shuf_g[1] = _mm_set_epi8(11, 10, 9, 8, 9, 8, 11, 10, 11, 10, 9, 8, 9, 8, 11, 10);
  shuf_g[0] = _mm_set_epi8(15, 14, 13, 12, 13, 12, 15, 14, 15, 14, 13, 12, 13, 12, 15, 14);
  __m128i gv;
  int16_t *b = &s->branch[2 * long_cb - 8];
  __m128i *gPtr = (__m128i*) b;
  /* Define shuffle for beta normalization */
  __m128i shuf_norm = _mm_set_epi8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);

  /* This defines a beta computation step:
   * Adds and subtracts the branch metrics to the previous beta step,
   * shuffles the states according to the trellis path and selects the maximum state
   */
#define BETA_STEP(g)  bp = _mm_add_epi16(beta_k, g);\
                      bn = _mm_sub_epi16(beta_k, g);\
                      bp = _mm_shuffle_epi8(bp, shuf_bp);\
                      bn = _mm_shuffle_epi8(bn, shuf_bn);\
                      beta_k = _mm_max_epi16(bp, bn);

  /* Loads the alpha metrics from memory and adds them to the temporary bn and bp
   * metrics. Then computes the horizontal maximum of both metrics and takes the difference
   */
#define BETA_STEP_CNT(c, d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
                      BETA_STEP(g)\
                      alpha_k = _mm_load_si128(alphaPtr);\
                      alphaPtr--;\
                      bp = _mm_add_epi16(bp, alpha_k);\
                      bn = _mm_add_epi16(bn, alpha_k);\
                      output[k - d] = hMax(bn) - hMax(bp);
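  /* The scalar recursion that BETA_STEP/BETA_STEP_CNT vectorize is the standard
   * max-log-MAP backward pass (a sketch; signs follow the code's convention).
   * For each of the 8 states s, one trellis step computes
   *
   *   beta_k(s) = max( beta_{k+1}(s0) + g(s,s0), beta_{k+1}(s1) - g(s,s1) )
   *
   * where s0/s1 are the two successor states and g is the branch metric. The
   * shuffles shuf_bp/shuf_bn route the add/subtract results along the trellis
   * edges and _mm_max_epi16 takes the per-state maximum. The soft output for
   * step k is the difference of the horizontal maxima of (alpha + beta +/- g)
   * over the two bit hypotheses; the constant 0x7FFF offset introduced by
   * hMax() cancels in the subtraction.
   */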
  /* The tail does not require to load alpha or produce outputs. Only update
   * the beta metrics accordingly */
  for (k = end - 1; k >= long_cb; k--) {
    int16_t g0 = s->branch[2 * k];
    int16_t g1 = s->branch[2 * k + 1];
    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
    BETA_STEP(g);
  }

  /* Each loop iteration computes 8 trellis steps (two branch-metric loads),
   * normalizing beta after every 4 steps */
  __m128i norm;
  for (; k >= 0; k -= 8) {
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0, 0);
    BETA_STEP_CNT(1, 1);
    BETA_STEP_CNT(2, 2);
    BETA_STEP_CNT(3, 3);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm);
    beta_k = _mm_sub_epi16(beta_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0, 4);
    BETA_STEP_CNT(1, 5);
    BETA_STEP_CNT(2, 6);
    BETA_STEP_CNT(3, 7);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm);
    beta_k = _mm_sub_epi16(beta_k, norm);
  }
}

/* Computes alpha metrics */
void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
{
  uint32_t k;
  int16_t *alpha = s->alpha;
  uint32_t i;

  alpha[0] = 0;
  for (i = 1; i < 8; i++) {
    alpha[i] = -INF;
  }

  /* Define the shuffle constant for the positive alpha */
  __m128i shuf_ap = _mm_set_epi8(
    15, 14, // 7
    9,  8,  // 4
    7,  6,  // 3
    1,  0,  // 0
    13, 12, // 6
    11, 10, // 5
    5,  4,  // 2
    3,  2   // 1
  );

  /* Define the shuffle constant for the negative alpha */
  __m128i shuf_an = _mm_set_epi8(
    13, 12, // 6
    11, 10, // 5
    5,  4,  // 2
    3,  2,  // 1
    15, 14, // 7
    9,  8,  // 4
    7,  6,  // 3
    1,  0   // 0
  );

  /* Define shuffle for branch costs */
  __m128i shuf_g[4];
  shuf_g[0] = _mm_set_epi8(3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0, 3, 2, 3, 2);
  shuf_g[1] = _mm_set_epi8(7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, 7, 6, 7, 6);
  shuf_g[2] = _mm_set_epi8(11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8, 11, 10, 11, 10);
  shuf_g[3] = _mm_set_epi8(15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12, 15, 14, 15, 14);

  __m128i shuf_norm = _mm_set_epi8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);

  __m128i* alphaPtr = (__m128i*) alpha;
  alphaPtr++;

  __m128i gv;
  __m128i *gPtr = (__m128i*) s->branch;
  __m128i g, ap, an;

  __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);

  /* This defines an alpha computation step:
   * Adds and subtracts the branch metrics to the previous alpha step,
   * shuffles the states according to the trellis path and selects the maximum state
   */
#define ALPHA_STEP(c)  g = _mm_shuffle_epi8(gv, shuf_g[c]); \
                       ap = _mm_add_epi16(alpha_k, g);\
                       an = _mm_sub_epi16(alpha_k, g);\
                       ap = _mm_shuffle_epi8(ap, shuf_ap);\
                       an = _mm_shuffle_epi8(an, shuf_an);\
                       alpha_k = _mm_max_epi16(ap, an);\
                       _mm_store_si128(alphaPtr, alpha_k);\
                       alphaPtr++;

  /* Each loop iteration computes 8 trellis steps with two branch-metric loads,
   * normalizing alpha after every 4 steps */
  __m128i norm;
  for (k = 0; k < long_cb / 8; k++) {
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
    alpha_k = _mm_sub_epi16(alpha_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
    alpha_k = _mm_sub_epi16(alpha_k, norm);
  }
}
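/* Branch metrics: each constituent decoder sees one systematic and one parity
 * LLR per trellis step, so the metrics collapse to two distinct magnitudes and
 * map_gen_gamma() stores just two 16-bit values per step. With systematic LLR
 * x_k (plus a priori LLR a_k, when present) and parity LLR p_k, the tail loop
 * of map_gen_gamma() makes the convention explicit:
 *
 *   branch[2k]   = (x_k + a_k - p_k) / 2   ->  g0
 *   branch[2k+1] = (x_k + a_k + p_k) / 2   ->  g1
 *
 * Every edge metric used by the alpha/beta recursions is then +/-g0 or +/-g1,
 * which is why each ALPHA_STEP/BETA_STEP needs only an add, a subtract and a
 * max per state.
 */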
/* Compute branch metrics (gamma) */
void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
{
  __m128i res10, res20, res11, res21, res1, res2;
  __m128i in, ap, pa, g1, g0;

  __m128i *inPtr  = (__m128i*) input;
  __m128i *appPtr = (__m128i*) app;
  __m128i *paPtr  = (__m128i*) parity;
  __m128i *resPtr = (__m128i*) h->branch;

  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);

  for (int i = 0; i < long_cb / 8; i++) {
    in = _mm_load_si128(inPtr);
    inPtr++;
    pa = _mm_load_si128(paPtr);
    paPtr++;

    /* Add a priori information to the systematic bits, if present */
    if (appPtr) {
      ap = _mm_load_si128(appPtr);
      appPtr++;
      in = _mm_add_epi16(ap, in);
    }

    /* g1 = (in + pa) / 2, g0 = (in - pa) / 2 */
    g1 = _mm_add_epi16(in, pa);
    g0 = _mm_sub_epi16(in, pa);
    g1 = _mm_srai_epi16(g1, 1);
    g0 = _mm_srai_epi16(g0, 1);

    /* Interleave g0/g1 pairs into the branch array layout */
    res10 = _mm_shuffle_epi8(g0, res10_mask);
    res20 = _mm_shuffle_epi8(g0, res20_mask);
    res11 = _mm_shuffle_epi8(g1, res11_mask);
    res21 = _mm_shuffle_epi8(g1, res21_mask);

    res1 = _mm_or_si128(res10, res11);
    res2 = _mm_or_si128(res20, res21);

    _mm_store_si128(resPtr, res1);
    resPtr++;
    _mm_store_si128(resPtr, res2);
    resPtr++;
  }

  for (int i = long_cb; i < long_cb + 3; i++) {
    h->branch[2 * i]     = (input[i] - parity[i]) / 2;
    h->branch[2 * i + 1] = (input[i] + parity[i]) / 2;
  }
}

/* Initializes the constituent decoder object */
int map_gen_init(map_gen_t * h, int max_long_cb)
{
  bzero(h, sizeof(map_gen_t));
  h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
  if (!h->alpha) {
    perror("srslte_vec_malloc");
    return -1;
  }
  h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
  if (!h->branch) {
    perror("srslte_vec_malloc");
    return -1;
  }
  h->max_long_cb = max_long_cb;
  return 0;
}

void map_gen_free(map_gen_t * h)
{
  if (h->alpha) {
    free(h->alpha);
  }
  if (h->branch) {
    free(h->branch);
  }
  bzero(h, sizeof(map_gen_t));
}

/* Runs one instance of a decoder */
void map_gen_dec(map_gen_t * h, int16_t * input, int16_t *app, int16_t * parity, int16_t * output, uint32_t long_cb)
{
  // Compute branch metrics
  map_gen_gamma(h, input, app, parity, long_cb);

  // Forward recursion
  map_gen_alpha(h, long_cb);

  // Backwards recursion + LLR computation
  map_gen_beta(h, output, long_cb);
}

/* Initializes the turbo decoder object */
int srslte_tdec_sse_init(srslte_tdec_sse_t * h, uint32_t max_long_cb)
{
  int ret = -1;
  bzero(h, sizeof(srslte_tdec_sse_t));
  uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;

  h->max_long_cb = max_long_cb;

  h->app1 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->app1) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->app2 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->app2) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->ext1) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->ext2) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->syst = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->syst) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->parity0) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
  h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len);
  if (!h->parity1) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }

  if (map_gen_init(&h->dec, h->max_long_cb)) {
    goto clean_and_exit;
  }

  for (int i = 0; i < SRSLTE_NOF_TC_CB_SIZES; i++) {
    if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
      goto clean_and_exit;
    }
    srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
  }
  h->current_cbidx = -1;
  ret = 0;

clean_and_exit:
  if (ret == -1) {
    srslte_tdec_sse_free(h);
  }
  return ret;
}

void srslte_tdec_sse_free(srslte_tdec_sse_t * h)
{
  if (h->app1) {
    free(h->app1);
  }
  if (h->app2) {
    free(h->app2);
  }
  if (h->ext1) {
    free(h->ext1);
  }
  if (h->ext2) {
    free(h->ext2);
  }
  if (h->syst) {
    free(h->syst);
  }
  if (h->parity0) {
    free(h->parity0);
  }
  if (h->parity1) {
    free(h->parity1);
  }

  map_gen_free(&h->dec);

  for (int i = 0; i < SRSLTE_NOF_TC_CB_SIZES; i++) {
    srslte_tc_interl_free(&h->interleaver[i]);
  }

  bzero(h, sizeof(srslte_tdec_sse_t));
}
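/* Layout of the input soft bits consumed below (a sketch, inferred from the
 * shuffle masks and the tail indexing): the first 3*long_cb values are the
 * rate-1/3 triplets
 *
 *   x_0, p0_0, p1_0, x_1, p0_1, p1_1, ..., x_{K-1}, p0_{K-1}, p1_{K-1}
 *
 * followed by 3 (systematic, parity) tail pairs for the first constituent
 * decoder and 3 more for the second one. Each iteration of the main loop
 * loads three 128-bit vectors (24 values) and splits them into 8 systematic,
 * 8 parity-0 and 8 parity-1 soft bits using byte shuffles.
 */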
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
 * 3 buffers ready to be used by map_gen_gamma()
 */
void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
  uint32_t i;

  __m128i *inputPtr = (__m128i*) input;
  __m128i in0, in1, in2;
  __m128i s0, s1, s2, s;
  __m128i p00, p01, p02, p0;
  __m128i p10, p11, p12, p1;

  __m128i *sysPtr = (__m128i*) h->syst;
  __m128i *pa0Ptr = (__m128i*) h->parity0;
  __m128i *pa1Ptr = (__m128i*) h->parity1;

  // pick bits 0, 3, 6 from 1st word
  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
  // pick bits 1, 4, 7 from 2nd word
  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 2, 5 from 3rd word
  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 1, 4, 7 from 1st word
  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
  // pick bits 2, 5 from 2nd word
  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 0, 3, 6 from 3rd word
  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 2, 5 from 1st word
  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
  // pick bits 0, 3, 6 from 2nd word
  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
  // pick bits 1, 4, 7 from 3rd word
  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // Split systematic and parity bits
  for (i = 0; i < long_cb / 8; i++) {

    in0 = _mm_load_si128(inputPtr);
    inputPtr++;
    in1 = _mm_load_si128(inputPtr);
    inputPtr++;
    in2 = _mm_load_si128(inputPtr);
    inputPtr++;

    /* Deinterleave systematic bits */
    s0 = _mm_shuffle_epi8(in0, s0_mask);
    s1 = _mm_shuffle_epi8(in1, s1_mask);
    s2 = _mm_shuffle_epi8(in2, s2_mask);
    s = _mm_or_si128(s0, s1);
    s = _mm_or_si128(s, s2);
    _mm_store_si128(sysPtr, s);
    sysPtr++;

    /* Deinterleave parity 0 bits */
    p00 = _mm_shuffle_epi8(in0, p00_mask);
    p01 = _mm_shuffle_epi8(in1, p01_mask);
    p02 = _mm_shuffle_epi8(in2, p02_mask);
    p0 = _mm_or_si128(p00, p01);
    p0 = _mm_or_si128(p0, p02);
    _mm_store_si128(pa0Ptr, p0);
    pa0Ptr++;

    /* Deinterleave parity 1 bits */
    p10 = _mm_shuffle_epi8(in0, p10_mask);
    p11 = _mm_shuffle_epi8(in1, p11_mask);
    p12 = _mm_shuffle_epi8(in2, p12_mask);
    p1 = _mm_or_si128(p10, p11);
    p1 = _mm_or_si128(p1, p12);
    _mm_store_si128(pa1Ptr, p1);
    pa1Ptr++;
  }

  for (i = 0; i < 3; i++) {
    h->syst[i + long_cb]    = input[3 * long_cb + 2 * i];
    h->parity0[i + long_cb] = input[3 * long_cb + 2 * i + 1];
  }
  for (i = 0; i < 3; i++) {
    h->app2[i + long_cb]    = input[3 * long_cb + 6 + 2 * i];
    h->parity1[i + long_cb] = input[3 * long_cb + 6 + 2 * i + 1];
  }
}
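/* One turbo iteration, schematically (for n_iter > 0; on the first iteration
 * the a priori terms are zero and the corresponding steps are skipped):
 *
 *   app1 = app1 - ext1                        remove DEC1's own old extrinsic
 *   ext1 = DEC1(syst, app1, parity0) - app1   a posteriori -> extrinsic
 *   app2 = permute(ext1)                      reorder towards decoder 2
 *   ext2 = DEC2(app2, NULL, parity1)          app2 plays the systematic role
 *   app1 = permute_back(ext2)                 a priori for decoder 1
 */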
/* Runs 1 turbo decoder iteration */
void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h, int16_t * input, uint32_t long_cb)
{
  if (h->current_cbidx >= 0) {
    uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;

    if (h->n_iter == 0) {
      deinterleave_input(h, input, long_cb);
    }

    // Add apriori information to decoder 1
    if (h->n_iter > 0) {
      srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb);
    }

    // Run MAP DEC #1
    if (h->n_iter == 0) {
      map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb);
    } else {
      map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb);
    }

    // Convert aposteriori information into extrinsic information
    if (h->n_iter > 0) {
      srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb);
    }

    // Interleave extrinsic output of DEC1 to form apriori info for decoder 2
    srslte_vec_lut_sss(h->ext1, deinter, h->app2, long_cb);

    // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
    map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb);

    // Deinterleaved extrinsic bits become apriori info for decoder 1
    srslte_vec_lut_sss(h->ext2, inter, h->app1, long_cb);

    h->n_iter++;
  } else {
    fprintf(stderr, "Error: CB index not set (call srslte_tdec_sse_reset() first)\n");
  }
}

/* Resets the decoder and sets the codeblock length */
int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb)
{
  if (long_cb > h->max_long_cb) {
    fprintf(stderr, "TDEC was initialized for max_long_cb=%u\n", h->max_long_cb);
    return -1;
  }
  h->n_iter = 0;
  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
  if (h->current_cbidx < 0) {
    fprintf(stderr, "Invalid CB length %u\n", long_cb);
    return -1;
  }
  return 0;
}

void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
{
  __m128i zero     = _mm_set1_epi16(0);
  __m128i lsb_mask = _mm_set1_epi16(1);

  __m128i *appPtr = (__m128i*) h->app1;
  __m128i *outPtr = (__m128i*) output;
  __m128i ap, out, out0, out1;

  for (uint32_t i = 0; i < long_cb / 16; i++) {
    ap = _mm_load_si128(appPtr);
    appPtr++;
    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
    ap = _mm_load_si128(appPtr);
    appPtr++;
    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);

    out = _mm_packs_epi16(out0, out1);
    _mm_store_si128(outPtr, out);
    outPtr++;
  }
  /* long_cb is a multiple of 8; slice the remaining 8 values if it is not a
   * multiple of 16 */
  if (long_cb % 16) {
    for (int i = 0; i < 8; i++) {
      output[long_cb - 8 + i] = h->app1[long_cb - 8 + i] > 0 ? 1 : 0;
    }
  }
}

void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
{
  uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};

  // long_cb is always byte aligned
  for (uint32_t i = 0; i < long_cb / 8; i++) {
    uint8_t out0 = h->app1[8 * i + 0] > 0 ? mask[0] : 0;
    uint8_t out1 = h->app1[8 * i + 1] > 0 ? mask[1] : 0;
    uint8_t out2 = h->app1[8 * i + 2] > 0 ? mask[2] : 0;
    uint8_t out3 = h->app1[8 * i + 3] > 0 ? mask[3] : 0;
    uint8_t out4 = h->app1[8 * i + 4] > 0 ? mask[4] : 0;
    uint8_t out5 = h->app1[8 * i + 5] > 0 ? mask[5] : 0;
    uint8_t out6 = h->app1[8 * i + 6] > 0 ? mask[6] : 0;
    uint8_t out7 = h->app1[8 * i + 7] > 0 ? mask[7] : 0;

    output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
  }
}

/* Runs nof_iterations iterations and decides the output bits */
int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *output,
                            uint32_t nof_iterations, uint32_t long_cb)
{
  if (srslte_tdec_sse_reset(h, long_cb)) {
    return SRSLTE_ERROR;
  }

  do {
    srslte_tdec_sse_iteration(h, input, long_cb);
  } while (h->n_iter < nof_iterations);

  srslte_tdec_sse_decision_byte(h, output, long_cb);

  return SRSLTE_SUCCESS;
}

#endif
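/* Example usage (an illustrative sketch; the 6144-bit code block, which is the
 * largest LTE code block size, and the 5 iterations are assumptions, not API
 * requirements). The input buffer must be 16-byte aligned because
 * deinterleave_input() uses aligned SSE loads; srslte_vec_malloc() provides
 * suitably aligned memory:
 *
 *   srslte_tdec_sse_t tdec;
 *   int16_t *llr = srslte_vec_malloc((3 * 6144 + 12) * sizeof(int16_t));
 *   uint8_t bits[6144 / 8];   // decoded bits, packed MSB-first per byte
 *
 *   if (llr && !srslte_tdec_sse_init(&tdec, 6144)) {
 *     // ... fill llr[] with received soft bits for a 6144-bit code block ...
 *     srslte_tdec_sse_run_all(&tdec, llr, bits, 5, 6144);
 *     srslte_tdec_sse_free(&tdec);
 *   }
 *   free(llr);
 */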