mirror of https://github.com/PentHertz/srsLTE.git
Integrated AVX2 decoder in PDSCH object. Added inter-frame SSE decoder (not working and not integrated)
This commit is contained in:
parent
c1ef9da32a
commit
f00ea8c8ed
|
@ -53,6 +53,8 @@
|
|||
#define SRSLTE_MAX_LAYERS 4
|
||||
#define SRSLTE_MAX_CODEWORDS 2
|
||||
|
||||
#define SRSLTE_MAX_CODEBLOCKS 32
|
||||
|
||||
#define SRSLTE_LTE_CRC24A 0x1864CFB
|
||||
#define SRSLTE_LTE_CRC24B 0X1800063
|
||||
#define SRSLTE_LTE_CRC16 0x11021
|
||||
|
|
|
@ -52,12 +52,14 @@
|
|||
#include "srslte/phy/fec/turbodecoder_gen.h"
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include "srslte/phy/fec/turbodecoder_sse.h"
|
||||
#include "srslte/phy/fec/turbodecoder_simd.h"
|
||||
#else
|
||||
#define SRSLTE_TDEC_NPAR 1
|
||||
#endif
|
||||
|
||||
typedef struct SRSLTE_API {
|
||||
#ifdef LV_HAVE_SSE
|
||||
srslte_tdec_sse_t tdec_sse;
|
||||
srslte_tdec_simd_t tdec_simd;
|
||||
#else
|
||||
float *input_conv;
|
||||
srslte_tdec_gen_t tdec_gen;
|
||||
|
@ -69,7 +71,16 @@ SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h,
|
|||
|
||||
SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h);
|
||||
|
||||
SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb);
|
||||
SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_reset_cb(srslte_tdec_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API int srslte_tdec_get_nof_parallel(srslte_tdec_t * h);
|
||||
|
||||
SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h,
|
||||
int16_t* input,
|
||||
|
@ -89,4 +100,31 @@ SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h,
|
|||
uint32_t nof_iterations,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_iteration_par(srslte_tdec_t * h,
|
||||
int16_t* input[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_decision_par(srslte_tdec_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_decision_byte_par(srslte_tdec_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h,
|
||||
uint8_t *output,
|
||||
uint32_t cb_idx,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_run_all_par(srslte_tdec_t * h,
|
||||
int16_t * input[SRSLTE_TDEC_NPAR],
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_iterations,
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -66,6 +66,8 @@ typedef struct SRSLTE_API {
|
|||
float *parity;
|
||||
|
||||
int current_cbidx;
|
||||
uint32_t current_cb_len;
|
||||
uint32_t n_iter;
|
||||
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
|
||||
} srslte_tdec_gen_t;
|
||||
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
/**********************************************************************************************
|
||||
* File: turbodecoder.h
|
||||
*
|
||||
* Description: Turbo Decoder.
|
||||
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
|
||||
* encoders and one turbo code internal interleaver. The coding rate of turbo
|
||||
* encoder is 1/3.
|
||||
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
|
||||
*
|
||||
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
|
||||
*********************************************************************************************/
|
||||
|
||||
#ifndef TURBODECODER_SSE_INTER_
|
||||
#define TURBODECODER_SSE_INTER_
|
||||
|
||||
|
||||
/** This is an simd inter-frame parallel turbo decoder. Parallizes 8 code-blocks using SSE
|
||||
* This implementation is currently not functional and not used by the rest of the code
|
||||
*/
|
||||
|
||||
#include "srslte/config.h"
|
||||
#include "srslte/phy/fec/tc_interl.h"
|
||||
#include "srslte/phy/fec/cbsegm.h"
|
||||
|
||||
#if LV_HAVE_AVX2
|
||||
#define SRSLTE_TDEC_NPAR 16
|
||||
#else
|
||||
#define SRSLTE_TDEC_NPAR 8
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct SRSLTE_API {
|
||||
int max_long_cb;
|
||||
|
||||
int16_t *syst0;
|
||||
int16_t *parity0;
|
||||
int16_t *syst1;
|
||||
int16_t *parity1;
|
||||
int16_t *llr1;
|
||||
int16_t *llr2;
|
||||
int16_t *w;
|
||||
int16_t *alpha;
|
||||
|
||||
uint32_t max_par_cb;
|
||||
int current_cbidx;
|
||||
uint32_t current_long_cb;
|
||||
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
|
||||
int n_iter[SRSLTE_TDEC_NPAR];
|
||||
} srslte_tdec_simd_inter_t;
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t max_par_cb,
|
||||
uint32_t max_long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h,
|
||||
int16_t * input[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output,
|
||||
uint32_t cbidx,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
|
||||
int16_t *input[SRSLTE_TDEC_NPAR],
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_iterations,
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
#endif
|
|
@ -257,13 +257,10 @@ int main(int argc, char **argv) {
|
|||
int16_t *input[SRSLTE_TDEC_NPAR];
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR];
|
||||
|
||||
input[0] = llr_s;
|
||||
if (SRSLTE_TDEC_NPAR == 2)
|
||||
input[1] = llr_s;
|
||||
|
||||
output[0] = data_rx_bytes[0];
|
||||
if (SRSLTE_TDEC_NPAR == 2)
|
||||
output[1] = data_rx_bytes[1];
|
||||
for (int n=0;n<SRSLTE_TDEC_NPAR;n++) {
|
||||
input[n] = llr_s;
|
||||
output[n] = data_rx_bytes[n];
|
||||
}
|
||||
|
||||
gettimeofday(&tdata[1], NULL);
|
||||
for (int k=0;k<nof_repetitions;k++) {
|
||||
|
@ -284,7 +281,7 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
printf("Eb/No: %2.2f %10d/%d ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
|
||||
printf("BER: %.2e ", (float) errors / (nof_cb*frame_cnt * frame_length));
|
||||
printf("%3.1f Mbps (%6.2f usec)", (float) SRSLTE_TDEC_NPAR*frame_length / mean_usec, mean_usec);
|
||||
printf("%3.1f Mbps (%6.2f usec)", (float) (nof_cb*frame_length) / mean_usec, mean_usec);
|
||||
printf("\r");
|
||||
|
||||
}
|
||||
|
|
|
@ -43,7 +43,7 @@
|
|||
|
||||
int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
return srslte_tdec_simd_init(&h->tdec_simd, max_long_cb);
|
||||
return srslte_tdec_simd_init(&h->tdec_simd, SRSLTE_TDEC_NPAR, max_long_cb);
|
||||
#else
|
||||
h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12));
|
||||
if (!h->input_conv) {
|
||||
|
@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
|
|||
|
||||
void srslte_tdec_free(srslte_tdec_t * h) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
srslte_tdec_simd_free(&h->tdec_simd);
|
||||
srslte_tdec_simd_free(&h->tdec_simd);
|
||||
#else
|
||||
if (h->input_conv) {
|
||||
free(h->input_conv);
|
||||
|
@ -74,9 +74,26 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int srslte_tdec_reset_cb(srslte_tdec_t * h, uint32_t cb_idx) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
return srslte_tdec_simd_reset_cb(&h->tdec_simd, cb_idx);
|
||||
#else
|
||||
return srslte_tdec_gen_reset(&h->tdec_gen, h->tdec_gen.current_cb_len);
|
||||
#endif
|
||||
}
|
||||
|
||||
int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, uint32_t cb_idx)
|
||||
{
|
||||
#ifdef LV_HAVE_SSE
|
||||
return srslte_tdec_simd_get_nof_iterations_cb(&h->tdec_simd, cb_idx);
|
||||
#else
|
||||
return h->tdec_gen.n_iter;
|
||||
#endif
|
||||
}
|
||||
|
||||
void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb);
|
||||
srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb);
|
||||
#else
|
||||
srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
|
||||
srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb);
|
||||
|
@ -105,12 +122,20 @@ void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb)
|
|||
|
||||
void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb);
|
||||
srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb);
|
||||
#else
|
||||
srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb);
|
||||
#endif
|
||||
}
|
||||
|
||||
void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
srslte_tdec_simd_decision_byte_cb(&h->tdec_simd, output, cb_idx, long_cb);
|
||||
#else
|
||||
srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb);
|
||||
#endif
|
||||
}
|
||||
|
||||
void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
|
||||
uint8_t *output_par[SRSLTE_TDEC_NPAR];
|
||||
output_par[0] = output;
|
||||
|
@ -121,7 +146,7 @@ int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR]
|
|||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb) {
|
||||
#ifdef LV_HAVE_SSE
|
||||
return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb);
|
||||
return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb);
|
||||
#else
|
||||
srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
|
||||
return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb);
|
||||
|
|
|
@ -153,8 +153,6 @@ void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t lo
|
|||
__m256i gv;
|
||||
int16_t *b = &s->branch[2*NCB*long_cb-16];
|
||||
__m256i *gPtr = (__m256i*) b;
|
||||
|
||||
__m256i bn2, bp2;
|
||||
|
||||
/* This defines a beta computation step:
|
||||
* Adds and substracts the branch metrics to the previous beta step,
|
||||
|
@ -175,10 +173,10 @@ void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t lo
|
|||
alphaPtr--;\
|
||||
bp = _mm256_add_epi16(bp, alpha_k);\
|
||||
bn = _mm256_add_epi16(bn, alpha_k);\
|
||||
bn2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bn);\
|
||||
bp2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bp);\
|
||||
output[0][k-d] = hMax0(bn2) - hMax0(bp2);\
|
||||
output[1][k-d] = hMax1(bn2) - hMax1(bp2);
|
||||
bn = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bn);\
|
||||
bp = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bp);\
|
||||
output[0][k-d] = hMax0(bn) - hMax0(bp);\
|
||||
output[1][k-d] = hMax1(bn) - hMax1(bp);
|
||||
|
||||
/* The tail does not require to load alpha or produce outputs. Only update
|
||||
* beta metrics accordingly */
|
||||
|
@ -309,7 +307,7 @@ void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
|
|||
an = _mm256_shuffle_epi8(an, shuf_an);\
|
||||
alpha_k = _mm256_max_epi16(ap, an);\
|
||||
_mm256_store_si256(alphaPtr, alpha_k);\
|
||||
alphaPtr++; \
|
||||
alphaPtr++;\
|
||||
|
||||
|
||||
/* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */
|
||||
|
@ -335,15 +333,62 @@ void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
|
|||
}
|
||||
}
|
||||
|
||||
/* Compute branch metrics (gamma) */
|
||||
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb)
|
||||
void map_sse_gamma_single(int16_t *output, int16_t *input, int16_t *app, int16_t *parity)
|
||||
{
|
||||
__m128i res10, res20, res11, res21, res1, res2;
|
||||
__m128i res00, res10, res01, res11, res0, res1;
|
||||
__m128i in, ap, pa, g1, g0;
|
||||
|
||||
__m128i *inPtr = (__m128i*) input;
|
||||
__m128i *appPtr = (__m128i*) app;
|
||||
__m128i *paPtr = (__m128i*) parity;
|
||||
__m128i *resPtr = (__m128i*) output;
|
||||
|
||||
__m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
|
||||
__m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
|
||||
__m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
|
||||
__m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
|
||||
|
||||
in = _mm_load_si128(inPtr);
|
||||
inPtr++;
|
||||
pa = _mm_load_si128(paPtr);
|
||||
paPtr++;
|
||||
|
||||
if (appPtr) {
|
||||
ap = _mm_load_si128(appPtr);
|
||||
appPtr++;
|
||||
in = _mm_add_epi16(ap, in);
|
||||
}
|
||||
|
||||
g1 = _mm_add_epi16(in, pa);
|
||||
g0 = _mm_sub_epi16(in, pa);
|
||||
|
||||
g1 = _mm_srai_epi16(g1, 1);
|
||||
g0 = _mm_srai_epi16(g0, 1);
|
||||
|
||||
res00 = _mm_shuffle_epi8(g0, res00_mask);
|
||||
res10 = _mm_shuffle_epi8(g0, res10_mask);
|
||||
res01 = _mm_shuffle_epi8(g1, res01_mask);
|
||||
res11 = _mm_shuffle_epi8(g1, res11_mask);
|
||||
|
||||
res0 = _mm_or_si128(res00, res01);
|
||||
res1 = _mm_or_si128(res10, res11);
|
||||
|
||||
_mm_store_si128(resPtr, res0);
|
||||
resPtr++;
|
||||
_mm_store_si128(resPtr, res1);
|
||||
resPtr++;
|
||||
}
|
||||
|
||||
|
||||
/* Compute branch metrics (gamma) */
|
||||
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb)
|
||||
{
|
||||
__m128i res10, res20, res11, res21, res1, res2;
|
||||
__m256i in, ap, pa, g1, g0;
|
||||
|
||||
__m256i *inPtr = (__m256i*) input;
|
||||
__m256i *appPtr = (__m256i*) app;
|
||||
__m256i *paPtr = (__m256i*) parity;
|
||||
__m128i *resPtr = (__m128i*) h->branch;
|
||||
|
||||
if (cbidx) {
|
||||
|
@ -351,32 +396,37 @@ void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
|
|||
}
|
||||
|
||||
__m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
|
||||
__m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
|
||||
__m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
|
||||
|
||||
__m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
|
||||
__m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
|
||||
|
||||
for (int i=0;i<long_cb/8;i++) {
|
||||
in = _mm_load_si128(inPtr);
|
||||
for (int i=0;i<long_cb/16;i++) {
|
||||
in = _mm256_load_si256(inPtr);
|
||||
inPtr++;
|
||||
pa = _mm_load_si128(paPtr);
|
||||
pa = _mm256_load_si256(paPtr);
|
||||
paPtr++;
|
||||
|
||||
if (appPtr) {
|
||||
ap = _mm_load_si128(appPtr);
|
||||
ap = _mm256_load_si256(appPtr);
|
||||
appPtr++;
|
||||
in = _mm_add_epi16(ap, in);
|
||||
in = _mm256_add_epi16(ap, in);
|
||||
}
|
||||
|
||||
g1 = _mm_add_epi16(in, pa);
|
||||
g0 = _mm_sub_epi16(in, pa);
|
||||
|
||||
g1 = _mm_srai_epi16(g1, 1);
|
||||
g0 = _mm_srai_epi16(g0, 1);
|
||||
g0 = _mm256_sub_epi16(in, pa);
|
||||
g1 = _mm256_add_epi16(in, pa);
|
||||
|
||||
g0 = _mm256_srai_epi16(g0, 1);
|
||||
g1 = _mm256_srai_epi16(g1, 1);
|
||||
|
||||
res10 = _mm_shuffle_epi8(g0, res10_mask);
|
||||
res20 = _mm_shuffle_epi8(g0, res20_mask);
|
||||
res11 = _mm_shuffle_epi8(g1, res11_mask);
|
||||
res21 = _mm_shuffle_epi8(g1, res21_mask);
|
||||
__m128i g0_t = _mm256_extractf128_si256(g0, 0);
|
||||
__m128i g1_t = _mm256_extractf128_si256(g1, 0);
|
||||
|
||||
res10 = _mm_shuffle_epi8(g0_t, res10_mask);
|
||||
res11 = _mm_shuffle_epi8(g1_t, res11_mask);
|
||||
|
||||
res20 = _mm_shuffle_epi8(g0_t, res20_mask);
|
||||
res21 = _mm_shuffle_epi8(g1_t, res21_mask);
|
||||
|
||||
res1 = _mm_or_si128(res10, res11);
|
||||
res2 = _mm_or_si128(res20, res21);
|
||||
|
@ -386,7 +436,31 @@ void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
|
|||
resPtr++;
|
||||
_mm_store_si128(resPtr, res2);
|
||||
resPtr++;
|
||||
resPtr++;
|
||||
resPtr++;
|
||||
|
||||
g0_t = _mm256_extractf128_si256(g0, 1);
|
||||
g1_t = _mm256_extractf128_si256(g1, 1);
|
||||
|
||||
res10 = _mm_shuffle_epi8(g0_t, res10_mask);
|
||||
res11 = _mm_shuffle_epi8(g1_t, res11_mask);
|
||||
|
||||
res20 = _mm_shuffle_epi8(g0_t, res20_mask);
|
||||
res21 = _mm_shuffle_epi8(g1_t, res21_mask);
|
||||
|
||||
res1 = _mm_or_si128(res10, res11);
|
||||
res2 = _mm_or_si128(res20, res21);
|
||||
|
||||
_mm_store_si128(resPtr, res1);
|
||||
resPtr++;
|
||||
resPtr++;
|
||||
_mm_store_si128(resPtr, res2);
|
||||
resPtr++;
|
||||
resPtr++;
|
||||
|
||||
}
|
||||
|
||||
if (long_cb%16) {
|
||||
map_sse_gamma_single((int16_t*) resPtr, (int16_t*) appPtr, (int16_t*) inPtr, (int16_t*) paPtr);
|
||||
}
|
||||
|
||||
for (int i=long_cb;i<long_cb+3;i++) {
|
||||
|
|
|
@ -96,20 +96,23 @@ void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity
|
|||
}
|
||||
|
||||
/* Inititalizes constituent decoder object */
|
||||
int map_simd_init(map_gen_t * h, int max_long_cb)
|
||||
int map_simd_init(map_gen_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
|
||||
{
|
||||
bzero(h, sizeof(map_gen_t));
|
||||
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
|
||||
|
||||
h->max_par_cb = max_par_cb;
|
||||
h->max_long_cb = max_long_cb;
|
||||
|
||||
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
|
||||
if (!h->alpha) {
|
||||
perror("srslte_vec_malloc");
|
||||
return -1;
|
||||
}
|
||||
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
|
||||
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
|
||||
if (!h->branch) {
|
||||
perror("srslte_vec_malloc");
|
||||
return -1;
|
||||
}
|
||||
h->max_long_cb = max_long_cb;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -142,15 +145,16 @@ void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app
|
|||
}
|
||||
|
||||
/* Initializes the turbo decoder object */
|
||||
int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb)
|
||||
int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
|
||||
{
|
||||
int ret = -1;
|
||||
bzero(h, sizeof(srslte_tdec_simd_t));
|
||||
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
|
||||
|
||||
h->max_long_cb = max_long_cb;
|
||||
|
||||
for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
|
||||
h->max_par_cb = max_par_cb;
|
||||
|
||||
for (int i=0;i<h->max_par_cb;i++) {
|
||||
h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->app1[i]) {
|
||||
perror("srslte_vec_malloc");
|
||||
|
@ -189,7 +193,7 @@ int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb)
|
|||
|
||||
}
|
||||
|
||||
if (map_simd_init(&h->dec, h->max_long_cb)) {
|
||||
if (map_simd_init(&h->dec, h->max_par_cb, h->max_long_cb)) {
|
||||
goto clean_and_exit;
|
||||
}
|
||||
|
||||
|
@ -209,7 +213,7 @@ clean_and_exit:if (ret == -1) {
|
|||
|
||||
void srslte_tdec_simd_free(srslte_tdec_simd_t * h)
|
||||
{
|
||||
for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
|
||||
for (int i=0;i<h->max_par_cb;i++) {
|
||||
if (h->app1[i]) {
|
||||
free(h->app1[i]);
|
||||
}
|
||||
|
@ -333,33 +337,34 @@ void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbi
|
|||
void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
|
||||
int16_t *tmp_app[SRSLTE_TDEC_NPAR];
|
||||
|
||||
if (h->current_cbidx >= 0) {
|
||||
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
|
||||
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
|
||||
|
||||
if (h->n_iter == 0) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
if (h->n_iter[i] == 0) {
|
||||
deinterleave_input_simd(h, input[i], i, long_cb);
|
||||
}
|
||||
}
|
||||
|
||||
// Add apriori information to decoder 1
|
||||
if (h->n_iter > 0) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
if (h->n_iter[i] > 0) {
|
||||
srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb);
|
||||
}
|
||||
}
|
||||
|
||||
// Run MAP DEC #1
|
||||
if (h->n_iter == 0) {
|
||||
map_simd_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, nof_cb, long_cb);
|
||||
} else {
|
||||
map_simd_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, nof_cb, long_cb);
|
||||
for (int i=0;i<h->max_par_cb;i++) {
|
||||
tmp_app[i] = h->n_iter[i]?h->app1[i]:NULL;
|
||||
}
|
||||
map_simd_dec(&h->dec, h->syst, tmp_app, h->parity0, h->ext1, nof_cb, long_cb);
|
||||
|
||||
// Convert aposteriori information into extrinsic information
|
||||
if (h->n_iter > 0) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
if (h->n_iter[i] > 0) {
|
||||
srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb);
|
||||
}
|
||||
}
|
||||
|
@ -377,7 +382,9 @@ void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_T
|
|||
srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb);
|
||||
}
|
||||
|
||||
h->n_iter++;
|
||||
for (int i=0;i<h->max_par_cb;i++) {
|
||||
h->n_iter[i]++;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n");
|
||||
}
|
||||
|
@ -391,7 +398,9 @@ int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
|
|||
h->max_long_cb);
|
||||
return -1;
|
||||
}
|
||||
h->n_iter = 0;
|
||||
for (int i=0;i<h->max_par_cb;i++) {
|
||||
h->n_iter[i] = 0;
|
||||
}
|
||||
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
|
||||
if (h->current_cbidx < 0) {
|
||||
fprintf(stderr, "Invalid CB length %d\n", long_cb);
|
||||
|
@ -400,6 +409,17 @@ int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
|
||||
{
|
||||
h->n_iter[cb_idx] = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
|
||||
{
|
||||
return h->n_iter[cb_idx];
|
||||
}
|
||||
|
||||
void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
|
||||
{
|
||||
__m128i zero = _mm_set1_epi16(0);
|
||||
|
@ -433,7 +453,7 @@ void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TD
|
|||
}
|
||||
}
|
||||
|
||||
void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
|
||||
void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
|
||||
{
|
||||
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
|
||||
|
||||
|
@ -449,17 +469,13 @@ void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t c
|
|||
uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0;
|
||||
|
||||
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
|
||||
|
||||
//if (i<10) {
|
||||
// printf("output[%d]=%d\n",i,output[i]);
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
tdec_simd_decision_byte(h, output[i], i, long_cb);
|
||||
srslte_tdec_simd_decision_byte_cb(h, output[i], i, long_cb);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -474,7 +490,7 @@ int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC
|
|||
|
||||
do {
|
||||
srslte_tdec_simd_iteration(h, input, nof_cb, long_cb);
|
||||
} while (h->n_iter < nof_iterations);
|
||||
} while (h->n_iter[0] < nof_iterations);
|
||||
|
||||
srslte_tdec_simd_decision_byte(h, output, nof_cb, long_cb);
|
||||
|
||||
|
|
|
@ -0,0 +1,299 @@
|
|||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
#define TOTALTAIL 12
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <smmintrin.h>
|
||||
|
||||
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb);
|
||||
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb);
|
||||
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb);
|
||||
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb);
|
||||
|
||||
|
||||
static void map_sse_inter_dec(srslte_tdec_simd_inter_t * h, int16_t * input, int16_t * parity, int16_t * output,
|
||||
uint32_t long_cb)
|
||||
{
|
||||
map_see_inter_alpha(h, input, parity, long_cb);
|
||||
map_sse_inter_beta(h, input, parity, output, long_cb);
|
||||
}
|
||||
|
||||
/************************************************
|
||||
*
|
||||
* TURBO DECODER INTERFACE
|
||||
*
|
||||
************************************************/
|
||||
int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
|
||||
{
|
||||
int ret = -1;
|
||||
bzero(h, sizeof(srslte_tdec_simd_inter_t));
|
||||
uint32_t len = max_long_cb + 12;
|
||||
|
||||
h->max_long_cb = max_long_cb;
|
||||
h->max_par_cb = max_par_cb;
|
||||
|
||||
h->llr1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->llr1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->llr2 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->llr2) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->w = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->w) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->syst0) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->syst1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->syst1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->parity0) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
|
||||
if (!h->parity1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb);
|
||||
if (!h->alpha) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
|
||||
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
|
||||
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
|
||||
goto clean_and_exit;
|
||||
}
|
||||
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
|
||||
}
|
||||
h->current_cbidx = -1;
|
||||
ret = 0;
|
||||
clean_and_exit:if (ret == -1) {
|
||||
srslte_tdec_simd_inter_free(h);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h)
|
||||
{
|
||||
if (h->llr1) {
|
||||
free(h->llr1);
|
||||
}
|
||||
if (h->llr2) {
|
||||
free(h->llr2);
|
||||
}
|
||||
if (h->w) {
|
||||
free(h->w);
|
||||
}
|
||||
if (h->syst0) {
|
||||
free(h->syst0);
|
||||
}
|
||||
if (h->syst1) {
|
||||
free(h->syst1);
|
||||
}
|
||||
if (h->parity0) {
|
||||
free(h->parity0);
|
||||
}
|
||||
if (h->parity1) {
|
||||
free(h->parity1);
|
||||
}
|
||||
if (h->alpha) {
|
||||
free(h->alpha);
|
||||
}
|
||||
|
||||
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
|
||||
srslte_tc_interl_free(&h->interleaver[i]);
|
||||
}
|
||||
|
||||
bzero(h, sizeof(srslte_tdec_simd_inter_t));
|
||||
}
|
||||
|
||||
|
||||
/* Deinterleave for inter-frame parallelization */
|
||||
void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb)
|
||||
{
|
||||
for (int i=0;i<long_cb;i++) {
|
||||
h->syst0[h->max_par_cb*i+cbidx] = input[3*i+0];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1];
|
||||
h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2];
|
||||
}
|
||||
for (int i = long_cb; i < long_cb + 3; i++) {
|
||||
h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
|
||||
h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2];
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
|
||||
if (h->current_cbidx >= 0) {
|
||||
|
||||
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
|
||||
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
|
||||
|
||||
// Prepare systematic and parity bits for MAP DEC #1
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
if (h->n_iter[i] == 0) {
|
||||
extract_input(h, input[i], i, long_cb);
|
||||
}
|
||||
srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb);
|
||||
}
|
||||
|
||||
// Run MAP DEC #1
|
||||
map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb);
|
||||
|
||||
// Prepare systematic and parity bits for MAP DEC #1
|
||||
sse_inter_extract_syst1(h, inter, long_cb);
|
||||
|
||||
// Run MAP DEC #2
|
||||
map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb);
|
||||
|
||||
// Update a-priori LLR from the last iteration
|
||||
sse_inter_update_w(h, deinter, long_cb);
|
||||
|
||||
} else {
|
||||
fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n");
|
||||
}
|
||||
}
|
||||
|
||||
int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx)
|
||||
{
|
||||
for (int i=0;i<h->current_long_cb;i++) {
|
||||
h->w[h->max_par_cb*i+cb_idx] = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb)
|
||||
{
|
||||
if (long_cb > h->max_long_cb) {
|
||||
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
|
||||
h->max_long_cb);
|
||||
return -1;
|
||||
}
|
||||
h->current_long_cb = long_cb;
|
||||
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
|
||||
if (h->current_cbidx < 0) {
|
||||
fprintf(stderr, "Invalid CB length %d\n", long_cb);
|
||||
return -1;
|
||||
}
|
||||
memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
|
||||
{
|
||||
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
|
||||
uint32_t i;
|
||||
for (i = 0; i < long_cb; i++) {
|
||||
output[i] = (h->llr2[h->max_par_cb*deinter[i]+cb_idx] > 0) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
srslte_tdec_simd_inter_decision_cb(h, output[i], i, long_cb);
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
|
||||
{
|
||||
uint32_t i;
|
||||
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
|
||||
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
|
||||
|
||||
#define indexOf_cb(idx, cb) (h->max_par_cb*(deinter[8*i+idx])+cb)
|
||||
|
||||
// long_cb is always byte aligned
|
||||
for (i = 0; i < long_cb/8; i++) {
|
||||
uint8_t out0 = h->llr2[indexOf_cb(0, cb_idx)]>0?mask[0]:0;
|
||||
uint8_t out1 = h->llr2[indexOf_cb(1, cb_idx)]>0?mask[1]:0;
|
||||
uint8_t out2 = h->llr2[indexOf_cb(2, cb_idx)]>0?mask[2]:0;
|
||||
uint8_t out3 = h->llr2[indexOf_cb(3, cb_idx)]>0?mask[3]:0;
|
||||
uint8_t out4 = h->llr2[indexOf_cb(4, cb_idx)]>0?mask[4]:0;
|
||||
uint8_t out5 = h->llr2[indexOf_cb(5, cb_idx)]>0?mask[5]:0;
|
||||
uint8_t out6 = h->llr2[indexOf_cb(6, cb_idx)]>0?mask[6]:0;
|
||||
uint8_t out7 = h->llr2[indexOf_cb(7, cb_idx)]>0?mask[7]:0;
|
||||
|
||||
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
for (int i=0;i<nof_cb;i++) {
|
||||
srslte_tdec_simd_inter_decision_byte_cb(h, output[i], i, long_cb);
|
||||
}
|
||||
}
|
||||
|
||||
int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
|
||||
int16_t *input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
|
||||
{
|
||||
uint32_t iter = 0;
|
||||
|
||||
if (srslte_tdec_simd_inter_reset(h, long_cb)) {
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
do {
|
||||
srslte_tdec_simd_inter_iteration(h, input, nof_cb, long_cb);
|
||||
iter++;
|
||||
} while (iter < nof_iterations);
|
||||
|
||||
srslte_tdec_simd_inter_decision_byte(h, output, nof_cb, long_cb);
|
||||
|
||||
return SRSLTE_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -31,7 +31,7 @@
|
|||
#include <strings.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "srslte/phy/fec/turbodecoder_sse.h"
|
||||
#include "srslte/phy/fec/turbodecoder_simd.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
#include <inttypes.h>
|
||||
|
@ -62,17 +62,20 @@ static void print_128i(__m128i x) {
|
|||
printf("]\n");
|
||||
}
|
||||
*/
|
||||
//#define use_beta_transposed_max
|
||||
|
||||
#ifndef use_beta_transposed_max
|
||||
|
||||
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
|
||||
static inline int16_t hMax(__m128i buffer)
|
||||
{
|
||||
__m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
|
||||
__m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer);
|
||||
__m128i tmp3 = _mm_minpos_epu16(tmp1);
|
||||
return (int16_t)(_mm_cvtsi128_si32(tmp3));
|
||||
}
|
||||
|
||||
/* Computes beta values */
|
||||
void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
||||
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
||||
{
|
||||
int k;
|
||||
uint32_t end = long_cb + 3;
|
||||
|
@ -138,8 +141,8 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
|||
alphaPtr--;\
|
||||
bp = _mm_add_epi16(bp, alpha_k);\
|
||||
bn = _mm_add_epi16(bn, alpha_k);\
|
||||
output[k-d] = hMax(bn) - hMax(bp);
|
||||
|
||||
output[k-d] = hMax(bn)-hMax(bp);
|
||||
|
||||
/* The tail does not require to load alpha or produce outputs. Only update
|
||||
* beta metrics accordingly */
|
||||
for (k=end-1; k>=long_cb; k--) {
|
||||
|
@ -154,6 +157,7 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
|||
for (; k >= 0; k-=8) {
|
||||
gv = _mm_load_si128(gPtr);
|
||||
gPtr--;
|
||||
|
||||
BETA_STEP_CNT(0,0);
|
||||
BETA_STEP_CNT(1,1);
|
||||
BETA_STEP_CNT(2,2);
|
||||
|
@ -165,14 +169,17 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
|||
BETA_STEP_CNT(0,4);
|
||||
BETA_STEP_CNT(1,5);
|
||||
BETA_STEP_CNT(2,6);
|
||||
BETA_STEP_CNT(3,7);
|
||||
BETA_STEP_CNT(3,7);
|
||||
|
||||
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
|
||||
beta_k = _mm_sub_epi16(beta_k, norm);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Computes alpha metrics */
|
||||
void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
|
||||
void map_sse_alpha(map_gen_t * s, uint32_t long_cb)
|
||||
{
|
||||
uint32_t k;
|
||||
int16_t *alpha = s->alpha;
|
||||
|
@ -261,9 +268,9 @@ void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
|
|||
}
|
||||
|
||||
/* Compute branch metrics (gamma) */
|
||||
void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
|
||||
void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
|
||||
{
|
||||
__m128i res10, res20, res11, res21, res1, res2;
|
||||
__m128i res00, res10, res01, res11, res0, res1;
|
||||
__m128i in, ap, pa, g1, g0;
|
||||
|
||||
__m128i *inPtr = (__m128i*) input;
|
||||
|
@ -271,10 +278,10 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
|
|||
__m128i *paPtr = (__m128i*) parity;
|
||||
__m128i *resPtr = (__m128i*) h->branch;
|
||||
|
||||
__m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
|
||||
__m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
|
||||
__m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
|
||||
__m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
|
||||
__m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
|
||||
__m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
|
||||
__m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
|
||||
__m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
|
||||
|
||||
for (int i=0;i<long_cb/8;i++) {
|
||||
in = _mm_load_si128(inPtr);
|
||||
|
@ -294,17 +301,17 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
|
|||
g1 = _mm_srai_epi16(g1, 1);
|
||||
g0 = _mm_srai_epi16(g0, 1);
|
||||
|
||||
res00 = _mm_shuffle_epi8(g0, res00_mask);
|
||||
res10 = _mm_shuffle_epi8(g0, res10_mask);
|
||||
res20 = _mm_shuffle_epi8(g0, res20_mask);
|
||||
res01 = _mm_shuffle_epi8(g1, res01_mask);
|
||||
res11 = _mm_shuffle_epi8(g1, res11_mask);
|
||||
res21 = _mm_shuffle_epi8(g1, res21_mask);
|
||||
|
||||
res0 = _mm_or_si128(res00, res01);
|
||||
res1 = _mm_or_si128(res10, res11);
|
||||
res2 = _mm_or_si128(res20, res21);
|
||||
|
||||
_mm_store_si128(resPtr, res1);
|
||||
_mm_store_si128(resPtr, res0);
|
||||
resPtr++;
|
||||
_mm_store_si128(resPtr, res2);
|
||||
_mm_store_si128(resPtr, res1);
|
||||
resPtr++;
|
||||
}
|
||||
|
||||
|
@ -314,356 +321,177 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
|
|||
}
|
||||
}
|
||||
|
||||
/* Inititalizes constituent decoder object */
|
||||
int map_gen_init(map_gen_t * h, int max_long_cb)
|
||||
{
|
||||
bzero(h, sizeof(map_gen_t));
|
||||
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
|
||||
if (!h->alpha) {
|
||||
perror("srslte_vec_malloc");
|
||||
return -1;
|
||||
}
|
||||
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
|
||||
if (!h->branch) {
|
||||
perror("srslte_vec_malloc");
|
||||
return -1;
|
||||
}
|
||||
h->max_long_cb = max_long_cb;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void map_gen_free(map_gen_t * h)
|
||||
{
|
||||
if (h->alpha) {
|
||||
free(h->alpha);
|
||||
}
|
||||
if (h->branch) {
|
||||
free(h->branch);
|
||||
}
|
||||
bzero(h, sizeof(map_gen_t));
|
||||
}
|
||||
|
||||
/* Runs one instance of a decoder */
|
||||
void map_gen_dec(map_gen_t * h, int16_t * input, int16_t *app, int16_t * parity, int16_t * output,
|
||||
uint32_t long_cb)
|
||||
{
|
||||
|
||||
// Compute branch metrics
|
||||
map_gen_gamma(h, input, app, parity, long_cb);
|
||||
|
||||
// Forward recursion
|
||||
map_gen_alpha(h, long_cb);
|
||||
|
||||
// Backwards recursion + LLR computation
|
||||
map_gen_beta(h, output, long_cb);
|
||||
|
||||
}
|
||||
|
||||
/* Initializes the turbo decoder object */
|
||||
int srslte_tdec_sse_init(srslte_tdec_sse_t * h, uint32_t max_long_cb)
|
||||
{
|
||||
int ret = -1;
|
||||
bzero(h, sizeof(srslte_tdec_sse_t));
|
||||
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
|
||||
|
||||
h->max_long_cb = max_long_cb;
|
||||
|
||||
h->app1 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->app1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->app2 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->app2) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->ext1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->ext2) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->syst = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->syst) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->parity0) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len);
|
||||
if (!h->parity1) {
|
||||
perror("srslte_vec_malloc");
|
||||
goto clean_and_exit;
|
||||
}
|
||||
|
||||
if (map_gen_init(&h->dec, h->max_long_cb)) {
|
||||
goto clean_and_exit;
|
||||
}
|
||||
|
||||
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
|
||||
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
|
||||
goto clean_and_exit;
|
||||
}
|
||||
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
|
||||
}
|
||||
h->current_cbidx = -1;
|
||||
ret = 0;
|
||||
clean_and_exit:if (ret == -1) {
|
||||
srslte_tdec_sse_free(h);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void srslte_tdec_sse_free(srslte_tdec_sse_t * h)
|
||||
{
|
||||
if (h->app1) {
|
||||
free(h->app1);
|
||||
}
|
||||
if (h->app2) {
|
||||
free(h->app2);
|
||||
}
|
||||
if (h->ext1) {
|
||||
free(h->ext1);
|
||||
}
|
||||
if (h->ext2) {
|
||||
free(h->ext2);
|
||||
}
|
||||
if (h->syst) {
|
||||
free(h->syst);
|
||||
}
|
||||
if (h->parity0) {
|
||||
free(h->parity0);
|
||||
}
|
||||
if (h->parity1) {
|
||||
free(h->parity1);
|
||||
}
|
||||
|
||||
map_gen_free(&h->dec);
|
||||
|
||||
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
|
||||
srslte_tc_interl_free(&h->interleaver[i]);
|
||||
}
|
||||
|
||||
bzero(h, sizeof(srslte_tdec_sse_t));
|
||||
}
|
||||
|
||||
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
|
||||
* 3 buffers ready to be used by compute_gamma()
|
||||
/***********************
|
||||
*
|
||||
* This is an attempt to parallelize the horizontal max
|
||||
* by doing a 8x8 tranpose of the vectors and computing max
|
||||
* in cascade. However since we need to store 16 registers
|
||||
* for the positive and negative values the performance is not very good
|
||||
*/
|
||||
void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
|
||||
uint32_t i;
|
||||
|
||||
|
||||
#ifdef use_beta_transposed_max
|
||||
|
||||
static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d,
|
||||
__m128i e, __m128i f, __m128i g, __m128i h)
|
||||
{
|
||||
// Transpose 8 vectors
|
||||
__m128i t0 = _mm_unpacklo_epi16(a, b);
|
||||
__m128i t1 = _mm_unpacklo_epi16(c, d);
|
||||
__m128i t2 = _mm_unpacklo_epi16(e, f);
|
||||
__m128i t3 = _mm_unpacklo_epi16(g, h);
|
||||
__m128i t4 = _mm_unpackhi_epi16(a, b);
|
||||
__m128i t5 = _mm_unpackhi_epi16(c, d);
|
||||
__m128i t6 = _mm_unpackhi_epi16(e, f);
|
||||
__m128i t7 = _mm_unpackhi_epi16(g, h);
|
||||
|
||||
__m128i s0 = _mm_unpacklo_epi32(t0, t1);
|
||||
__m128i s1 = _mm_unpackhi_epi32(t0, t1);
|
||||
__m128i s2 = _mm_unpacklo_epi32(t2, t3);
|
||||
__m128i s3 = _mm_unpackhi_epi32(t2, t3);
|
||||
__m128i s4 = _mm_unpacklo_epi32(t4, t5);
|
||||
__m128i s5 = _mm_unpackhi_epi32(t4, t5);
|
||||
__m128i s6 = _mm_unpacklo_epi32(t6, t7);
|
||||
__m128i s7 = _mm_unpackhi_epi32(t6, t7);
|
||||
|
||||
__m128i x0 = _mm_unpacklo_epi64(s0, s2);
|
||||
__m128i x1 = _mm_unpackhi_epi64(s0, s2);
|
||||
__m128i x2 = _mm_unpacklo_epi64(s1, s3);
|
||||
__m128i x3 = _mm_unpackhi_epi64(s1, s3);
|
||||
__m128i x4 = _mm_unpacklo_epi64(s4, s6);
|
||||
__m128i x5 = _mm_unpackhi_epi64(s4, s6);
|
||||
__m128i x6 = _mm_unpacklo_epi64(s5, s7);
|
||||
__m128i x7 = _mm_unpackhi_epi64(s5, s7);
|
||||
|
||||
// Cascade max on the transposed vector
|
||||
__m128i res = _mm_max_epi16(x0,
|
||||
_mm_max_epi16(x1,
|
||||
_mm_max_epi16(x2,
|
||||
_mm_max_epi16(x3,
|
||||
_mm_max_epi16(x4,
|
||||
_mm_max_epi16(x5,
|
||||
_mm_max_epi16(x6,
|
||||
x7)))))));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
|
||||
{
|
||||
int k;
|
||||
uint32_t end = long_cb + 3;
|
||||
const __m128i *alphaPtr = (const __m128i*) s->alpha;
|
||||
|
||||
__m128i *inputPtr = (__m128i*) input;
|
||||
__m128i in0, in1, in2;
|
||||
__m128i s0, s1, s2, s;
|
||||
__m128i p00, p01, p02, p0;
|
||||
__m128i p10, p11, p12, p1;
|
||||
__m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
|
||||
__m128i g, alpha_k;
|
||||
__m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7;
|
||||
__m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7;
|
||||
|
||||
__m128i *sysPtr = (__m128i*) h->syst;
|
||||
__m128i *pa0Ptr = (__m128i*) h->parity0;
|
||||
__m128i *pa1Ptr = (__m128i*) h->parity1;
|
||||
|
||||
// pick bits 0, 3, 6 from 1st word
|
||||
__m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
|
||||
// pick bits 1, 4, 7 from 2st word
|
||||
__m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
|
||||
// pick bits 2, 5 from 3rd word
|
||||
__m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
|
||||
/* Define the shuffle constant for the positive beta */
|
||||
__m128i shuf_bp = _mm_set_epi8(
|
||||
15, 14, // 7
|
||||
7, 6, // 3
|
||||
5, 4, // 2
|
||||
13, 12, // 6
|
||||
11, 10, // 5
|
||||
3, 2, // 1
|
||||
1, 0, // 0
|
||||
9, 8 // 4
|
||||
);
|
||||
|
||||
// pick bits 1, 4, 7 from 1st word
|
||||
__m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
|
||||
// pick bits 2, 5, from 2st word
|
||||
__m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
|
||||
// pick bits 0, 3, 6 from 3rd word
|
||||
__m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
|
||||
/* Define the shuffle constant for the negative beta */
|
||||
__m128i shuf_bn = _mm_set_epi8(
|
||||
7, 6, // 3
|
||||
15, 14, // 7
|
||||
13, 12, // 6
|
||||
5, 4, // 2
|
||||
3, 2, // 1
|
||||
11, 10, // 5
|
||||
9, 8, // 4
|
||||
1, 0 // 0
|
||||
);
|
||||
|
||||
alphaPtr += long_cb-1;
|
||||
|
||||
/* Define shuffle for branch costs */
|
||||
__m128i shuf_g[4];
|
||||
shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
|
||||
shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
|
||||
shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
|
||||
shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
|
||||
__m128i gv;
|
||||
int16_t *b = &s->branch[2*long_cb-8];
|
||||
__m128i *gPtr = (__m128i*) b;
|
||||
/* Define shuffle for beta normalization */
|
||||
__m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
|
||||
|
||||
// pick bits 2, 5 from 1st word
|
||||
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
|
||||
// pick bits 0, 3, 6, from 2st word
|
||||
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
|
||||
// pick bits 1, 4, 7 from 3rd word
|
||||
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
|
||||
|
||||
// Split systematic and parity bits
|
||||
for (i = 0; i < long_cb/8; i++) {
|
||||
|
||||
in0 = _mm_load_si128(inputPtr); inputPtr++;
|
||||
in1 = _mm_load_si128(inputPtr); inputPtr++;
|
||||
in2 = _mm_load_si128(inputPtr); inputPtr++;
|
||||
/* This defines a beta computation step:
|
||||
* Adds and substracts the branch metrics to the previous beta step,
|
||||
* shuffles the states according to the trellis path and selects maximum state
|
||||
*/
|
||||
#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\
|
||||
bn = _mm_sub_epi16(beta_k, g);\
|
||||
bp = _mm_shuffle_epi8(bp, shuf_bp);\
|
||||
bn = _mm_shuffle_epi8(bn, shuf_bn);\
|
||||
beta_k = _mm_max_epi16(bp, bn);
|
||||
|
||||
/* Loads the alpha metrics from memory and adds them to the temporal bn and bp
|
||||
* metrics.
|
||||
*/
|
||||
#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
|
||||
BETA_STEP(g)\
|
||||
alpha_k = _mm_load_si128(alphaPtr);\
|
||||
alphaPtr--;\
|
||||
bp_##d = _mm_add_epi16(bp, alpha_k);\
|
||||
bn_##d = _mm_add_epi16(bn, alpha_k);\
|
||||
|
||||
/* The tail does not require to load alpha or produce outputs. Only update
|
||||
* beta metrics accordingly */
|
||||
for (k=end-1; k>=long_cb; k--) {
|
||||
int16_t g0 = s->branch[2*k];
|
||||
int16_t g1 = s->branch[2*k+1];
|
||||
g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
|
||||
BETA_STEP(g);
|
||||
}
|
||||
|
||||
/* We inline 2 trelis steps for each normalization */
|
||||
__m128i norm;
|
||||
__m128i *outPtr = (__m128i*) &output[long_cb-8];
|
||||
for (; k >= 0; k-=8) {
|
||||
gv = _mm_load_si128(gPtr);
|
||||
gPtr--;
|
||||
|
||||
/* Deinterleave Systematic bits */
|
||||
s0 = _mm_shuffle_epi8(in0, s0_mask);
|
||||
s1 = _mm_shuffle_epi8(in1, s1_mask);
|
||||
s2 = _mm_shuffle_epi8(in2, s2_mask);
|
||||
s = _mm_or_si128(s0, s1);
|
||||
s = _mm_or_si128(s, s2);
|
||||
|
||||
_mm_store_si128(sysPtr, s);
|
||||
sysPtr++;
|
||||
|
||||
/* Deinterleave parity 0 bits */
|
||||
p00 = _mm_shuffle_epi8(in0, p00_mask);
|
||||
p01 = _mm_shuffle_epi8(in1, p01_mask);
|
||||
p02 = _mm_shuffle_epi8(in2, p02_mask);
|
||||
p0 = _mm_or_si128(p00, p01);
|
||||
p0 = _mm_or_si128(p0, p02);
|
||||
BETA_STEP_CNT(0,0);
|
||||
BETA_STEP_CNT(1,1);
|
||||
BETA_STEP_CNT(2,2);
|
||||
BETA_STEP_CNT(3,3);
|
||||
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
|
||||
beta_k = _mm_sub_epi16(beta_k, norm);
|
||||
gv = _mm_load_si128(gPtr);
|
||||
gPtr--;
|
||||
BETA_STEP_CNT(0,4);
|
||||
BETA_STEP_CNT(1,5);
|
||||
BETA_STEP_CNT(2,6);
|
||||
BETA_STEP_CNT(3,7);
|
||||
norm = _mm_shuffle_epi8(beta_k, shuf_norm);
|
||||
beta_k = _mm_sub_epi16(beta_k, norm);
|
||||
|
||||
_mm_store_si128(pa0Ptr, p0);
|
||||
pa0Ptr++;
|
||||
|
||||
/* Deinterleave parity 1 bits */
|
||||
p10 = _mm_shuffle_epi8(in0, p10_mask);
|
||||
p11 = _mm_shuffle_epi8(in1, p11_mask);
|
||||
p12 = _mm_shuffle_epi8(in2, p12_mask);
|
||||
p1 = _mm_or_si128(p10, p11);
|
||||
p1 = _mm_or_si128(p1, p12);
|
||||
|
||||
_mm_store_si128(pa1Ptr, p1);
|
||||
pa1Ptr++;
|
||||
|
||||
}
|
||||
|
||||
for (i = 0; i < 3; i++) {
|
||||
h->syst[i+long_cb] = input[3*long_cb + 2*i];
|
||||
h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
|
||||
}
|
||||
for (i = 0; i < 3; i++) {
|
||||
h->app2[i+long_cb] = input[3*long_cb + 6 + 2*i];
|
||||
h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
|
||||
}
|
||||
|
||||
__m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0);
|
||||
__m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0);
|
||||
__m128i outval = _mm_sub_epi16(bp_transp,bn_transp);
|
||||
_mm_store_si128(outPtr, outval);
|
||||
outPtr--;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Runs 1 turbo decoder iteration */
|
||||
void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h, int16_t * input, uint32_t long_cb)
|
||||
{
|
||||
|
||||
if (h->current_cbidx >= 0) {
|
||||
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
|
||||
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
|
||||
|
||||
if (h->n_iter == 0) {
|
||||
deinterleave_input(h, input, long_cb);
|
||||
}
|
||||
|
||||
// Add apriori information to decoder 1
|
||||
if (h->n_iter > 0) {
|
||||
srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb);
|
||||
}
|
||||
|
||||
// Run MAP DEC #1
|
||||
if (h->n_iter == 0) {
|
||||
map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb);
|
||||
} else {
|
||||
map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb);
|
||||
}
|
||||
|
||||
// Convert aposteriori information into extrinsic information
|
||||
if (h->n_iter > 0) {
|
||||
srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb);
|
||||
}
|
||||
|
||||
// Interleave extrinsic output of DEC1 to form apriori info for decoder 2
|
||||
srslte_vec_lut_sss(h->ext1, deinter, h->app2, long_cb);
|
||||
|
||||
// Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
|
||||
map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb);
|
||||
|
||||
// Deinterleaved extrinsic bits become apriori info for decoder 1
|
||||
srslte_vec_lut_sss(h->ext2, inter, h->app1, long_cb);
|
||||
|
||||
h->n_iter++;
|
||||
} else {
|
||||
fprintf(stderr, "Error CB index not set (call srslte_tdec_sse_reset() first\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* Resets the decoder and sets the codeblock length */
|
||||
int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb)
|
||||
{
|
||||
if (long_cb > h->max_long_cb) {
|
||||
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
|
||||
h->max_long_cb);
|
||||
return -1;
|
||||
}
|
||||
h->n_iter = 0;
|
||||
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
|
||||
if (h->current_cbidx < 0) {
|
||||
fprintf(stderr, "Invalid CB length %d\n", long_cb);
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
|
||||
{
|
||||
__m128i zero = _mm_set1_epi16(0);
|
||||
__m128i lsb_mask = _mm_set1_epi16(1);
|
||||
|
||||
__m128i *appPtr = (__m128i*) h->app1;
|
||||
__m128i *outPtr = (__m128i*) output;
|
||||
__m128i ap, out, out0, out1;
|
||||
|
||||
for (uint32_t i = 0; i < long_cb/16; i++) {
|
||||
ap = _mm_load_si128(appPtr); appPtr++;
|
||||
out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
|
||||
ap = _mm_load_si128(appPtr); appPtr++;
|
||||
out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
|
||||
|
||||
out = _mm_packs_epi16(out0, out1);
|
||||
_mm_store_si128(outPtr, out);
|
||||
outPtr++;
|
||||
}
|
||||
if (long_cb%16) {
|
||||
for (int i=0;i<8;i++) {
|
||||
output[long_cb-8+i] = h->app1[long_cb-8+i]>0?1:0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
|
||||
{
|
||||
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
|
||||
|
||||
// long_cb is always byte aligned
|
||||
for (uint32_t i = 0; i < long_cb/8; i++) {
|
||||
uint8_t out0 = h->app1[8*i+0]>0?mask[0]:0;
|
||||
uint8_t out1 = h->app1[8*i+1]>0?mask[1]:0;
|
||||
uint8_t out2 = h->app1[8*i+2]>0?mask[2]:0;
|
||||
uint8_t out3 = h->app1[8*i+3]>0?mask[3]:0;
|
||||
uint8_t out4 = h->app1[8*i+4]>0?mask[4]:0;
|
||||
uint8_t out5 = h->app1[8*i+5]>0?mask[5]:0;
|
||||
uint8_t out6 = h->app1[8*i+6]>0?mask[6]:0;
|
||||
uint8_t out7 = h->app1[8*i+7]>0?mask[7]:0;
|
||||
|
||||
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
|
||||
}
|
||||
}
|
||||
|
||||
/* Runs nof_iterations iterations and decides the output bits */
|
||||
int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *output,
|
||||
uint32_t nof_iterations, uint32_t long_cb)
|
||||
{
|
||||
if (srslte_tdec_sse_reset(h, long_cb)) {
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
do {
|
||||
srslte_tdec_sse_iteration(h, input, long_cb);
|
||||
} while (h->n_iter < nof_iterations);
|
||||
|
||||
srslte_tdec_sse_decision_byte(h, output, long_cb);
|
||||
|
||||
return SRSLTE_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,198 @@
|
|||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
|
||||
#define NCB 8
|
||||
|
||||
#define INF 10000
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <smmintrin.h>
|
||||
|
||||
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb)
|
||||
{
|
||||
__m128i *llr1Ptr = (__m128i*) h->llr1;
|
||||
__m128i *wPtr = (__m128i*) h->w;
|
||||
__m128i *syst1Ptr = (__m128i*) h->syst1;
|
||||
|
||||
for (int i = 0; i < long_cb; i++) {
|
||||
__m128i llr1 = _mm_load_si128(&llr1Ptr[inter[i]]);
|
||||
__m128i w = _mm_load_si128(&wPtr[inter[i]]);
|
||||
_mm_store_si128(syst1Ptr++, _mm_sub_epi16(llr1, w));
|
||||
}
|
||||
}
|
||||
|
||||
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb)
|
||||
{
|
||||
__m128i *llr1Ptr = (__m128i*) h->llr1;
|
||||
__m128i *llr2Ptr = (__m128i*) h->llr2;
|
||||
__m128i *wPtr = (__m128i*) h->w;
|
||||
__m128i *syst1Ptr = (__m128i*) h->syst1;
|
||||
|
||||
for (int i = 0; i < long_cb; i++) {
|
||||
__m128i llr1 = _mm_load_si128(llr1Ptr++);
|
||||
__m128i w = _mm_load_si128(wPtr++);
|
||||
__m128i llr2 = _mm_load_si128(&llr2Ptr[deinter[i]]);
|
||||
|
||||
_mm_store_si128(syst1Ptr++, _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1)));
|
||||
}
|
||||
}
|
||||
|
||||
/* Computes beta values */
|
||||
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb)
|
||||
{
|
||||
__m128i m_b[8], new[8], old[8], max1[8], max0[8];
|
||||
__m128i x, y, xy;
|
||||
__m128i m1, m0;
|
||||
uint32_t end = long_cb + 3;
|
||||
uint32_t i;
|
||||
|
||||
__m128i *inputPtr = (__m128i*) input;
|
||||
__m128i *parityPtr = (__m128i*) parity;
|
||||
__m128i *outputPtr = (__m128i*) output;
|
||||
__m128i *alphaPtr = (__m128i*) s->alpha;
|
||||
|
||||
for (int k = end - 1; k >= 0; k--) {
|
||||
x = _mm_load_si128(inputPtr++);
|
||||
y = _mm_load_si128(parityPtr++);
|
||||
|
||||
xy = _mm_add_epi16(x,y);
|
||||
|
||||
m_b[0] = _mm_add_epi16(old[4], xy);
|
||||
m_b[1] = old[4];
|
||||
m_b[2] = _mm_add_epi16(old[5], y);
|
||||
m_b[3] = _mm_add_epi16(old[5], x);
|
||||
m_b[4] = _mm_add_epi16(old[6], x);
|
||||
m_b[5] = _mm_add_epi16(old[6], y);
|
||||
m_b[6] = old[7];
|
||||
m_b[7] = _mm_add_epi16(old[7], xy);
|
||||
|
||||
new[0] = old[0];
|
||||
new[1] = _mm_add_epi16(old[0], xy);
|
||||
new[2] = _mm_add_epi16(old[1], x);
|
||||
new[3] = _mm_add_epi16(old[1], y);
|
||||
new[4] = _mm_add_epi16(old[2], y);
|
||||
new[5] = _mm_add_epi16(old[2], x);
|
||||
new[6] = _mm_add_epi16(old[3], xy);
|
||||
new[7] = old[3];
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
__m128i alpha = _mm_load_si128(alphaPtr++);
|
||||
max0[i] = _mm_add_epi16(alpha, m_b[i]);
|
||||
max1[i] = _mm_add_epi16(alpha, new[i]);
|
||||
}
|
||||
|
||||
m1 = _mm_max_epi16(max1[0], max1[1]);
|
||||
m0 = _mm_max_epi16(max0[0], max0[1]);
|
||||
|
||||
for (i = 2; i < 8; i++) {
|
||||
m1 = _mm_max_epi16(m1, max1[i]);
|
||||
m0 = _mm_max_epi16(m0, max0[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
new[i] = _mm_max_epi16(m_b[i], new[i]);
|
||||
old[i] = new[i];
|
||||
}
|
||||
|
||||
__m128i out = _mm_sub_epi16(m1, m0);
|
||||
_mm_store_si128(outputPtr++, out);
|
||||
|
||||
// normalize
|
||||
if ((k%4)==0) {
|
||||
for (int i=1;i<8;i++) {
|
||||
_mm_sub_epi16(old[i], old[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Computes alpha metrics */
|
||||
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb)
|
||||
{
|
||||
__m128i m_b[8], new[8], old[8];
|
||||
__m128i x, y, xy;
|
||||
uint32_t k;
|
||||
|
||||
__m128i *inputPtr = (__m128i*) input;
|
||||
__m128i *parityPtr = (__m128i*) parity;
|
||||
__m128i *alphaPtr = (__m128i*) s->alpha;
|
||||
|
||||
old[0] = _mm_set1_epi16(0);
|
||||
for (int i = 1; i < 8; i++) {
|
||||
old[i] = _mm_set1_epi16(-INF);
|
||||
}
|
||||
|
||||
for (k = 0; k < long_cb; k++) {
|
||||
x = _mm_load_si128(inputPtr++);
|
||||
y = _mm_load_si128(parityPtr++);
|
||||
|
||||
xy = _mm_add_epi16(x,y);
|
||||
|
||||
m_b[0] = old[0];
|
||||
m_b[1] = _mm_add_epi16(old[3], y);
|
||||
m_b[2] = _mm_add_epi16(old[4], y);
|
||||
m_b[3] = old[7];
|
||||
m_b[4] = old[1];
|
||||
m_b[5] = _mm_add_epi16(old[2], y);
|
||||
m_b[6] = _mm_add_epi16(old[5], y);
|
||||
m_b[7] = old[6];
|
||||
|
||||
new[0] = _mm_add_epi16(old[1], xy);
|
||||
new[1] = _mm_add_epi16(old[2], x);
|
||||
new[2] = _mm_add_epi16(old[5], x);
|
||||
new[3] = _mm_add_epi16(old[6], xy);
|
||||
new[4] = _mm_add_epi16(old[0], xy);
|
||||
new[5] = _mm_add_epi16(old[3], x);
|
||||
new[6] = _mm_add_epi16(old[4], x);
|
||||
new[7] = _mm_add_epi16(old[7], xy);
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
new[i] = _mm_max_epi16(m_b[i], new[i]);
|
||||
old[i] = new[i];
|
||||
_mm_store_si128(alphaPtr++, old[i]);
|
||||
}
|
||||
|
||||
// normalize
|
||||
if ((k%4)==0) {
|
||||
for (int i=1;i<8;i++) {
|
||||
_mm_sub_epi16(old[i], old[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -311,8 +311,124 @@ static int encode_tb(srslte_sch_t *q,
|
|||
return encode_tb_off(q, soft_buffer, cb_segm, Qm, rv, nof_e_bits, data, e_bits, 0);
|
||||
}
|
||||
|
||||
|
||||
bool decode_tb_cb(srslte_sch_t *q,
|
||||
srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm,
|
||||
uint32_t Qm, uint32_t rv, uint32_t nof_e_bits,
|
||||
int16_t *e_bits, uint8_t *data,
|
||||
uint32_t cb_size_group, uint8_t parity[3])
|
||||
{
|
||||
|
||||
bool cb_map[SRSLTE_MAX_CODEBLOCKS];
|
||||
|
||||
bzero(cb_map, sizeof(bool)*SRSLTE_MAX_CODEBLOCKS);
|
||||
|
||||
uint32_t cb_idx[SRSLTE_TDEC_NPAR];
|
||||
int16_t *decoder_input[SRSLTE_TDEC_NPAR];
|
||||
|
||||
uint32_t nof_cb = cb_size_group?cb_segm->C2:cb_segm->C1;
|
||||
uint32_t first_cb = cb_size_group?cb_segm->C1:0;
|
||||
uint32_t cb_len = cb_size_group?cb_segm->K2:cb_segm->K1;
|
||||
uint32_t cb_len_idx = cb_size_group?cb_segm->K2_idx:cb_segm->K1_idx;
|
||||
|
||||
uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24);
|
||||
uint32_t Gp = nof_e_bits / Qm;
|
||||
uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp;
|
||||
uint32_t n_e = Qm * (Gp/cb_segm->C);
|
||||
|
||||
if (nof_cb > SRSLTE_MAX_CODEBLOCKS) {
|
||||
fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
|
||||
cb_idx[i] = i+first_cb;
|
||||
}
|
||||
|
||||
srslte_tdec_reset(&q->decoder, cb_len);
|
||||
|
||||
uint32_t remaining_cb = nof_cb;
|
||||
|
||||
while(remaining_cb>0) {
|
||||
uint32_t npar = SRSLTE_MIN(remaining_cb, SRSLTE_TDEC_NPAR);
|
||||
|
||||
// Unratematch the codeblocks left to decode
|
||||
for (int i=0;i<npar;i++) {
|
||||
|
||||
// Find a not processed CB
|
||||
cb_idx[i]=first_cb;
|
||||
while(cb_idx[i]<first_cb+nof_cb && cb_map[cb_idx[i]]) {
|
||||
cb_idx[i]++;
|
||||
}
|
||||
|
||||
cb_map[cb_idx[i]] = true;
|
||||
|
||||
uint32_t rp = cb_idx[i]*n_e;
|
||||
uint32_t n_e2 = n_e;
|
||||
|
||||
if (cb_idx[i] > cb_segm->C - gamma) {
|
||||
n_e2 = n_e+Qm;
|
||||
rp = (cb_segm->C - gamma)*n_e + (cb_idx[i]-(cb_segm->C - gamma))*n_e2;
|
||||
}
|
||||
|
||||
INFO("CB %d: rp=%d, n_e=%d, i=%d\n", cb_idx[i], rp, n_e2, i);
|
||||
if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[cb_idx[i]], n_e2, cb_len_idx, rv)) {
|
||||
fprintf(stderr, "Error in rate matching\n");
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
decoder_input[i] = softbuffer->buffer_f[cb_idx[i]];
|
||||
}
|
||||
|
||||
// Run 1 iteration for up to TDEC_NPAR codeblocks
|
||||
if (SRSLTE_TDEC_NPAR > 1) {
|
||||
INFO("Processing %d CBs, index %d,%d\n", npar, cb_idx[0], cb_idx[1]);
|
||||
}
|
||||
srslte_tdec_iteration_par(&q->decoder, decoder_input, npar, cb_len);
|
||||
|
||||
// Decide output bits and compute CRC
|
||||
for (int i=0;i<npar;i++) {
|
||||
srslte_tdec_decision_byte_par_cb(&q->decoder, q->cb_in, i, cb_len);
|
||||
|
||||
uint32_t len_crc;
|
||||
srslte_crc_t *crc_ptr;
|
||||
|
||||
if (cb_segm->C > 1) {
|
||||
len_crc = cb_len;
|
||||
crc_ptr = &q->crc_cb;
|
||||
} else {
|
||||
len_crc = cb_segm->tbs+24;
|
||||
crc_ptr = &q->crc_tb;
|
||||
}
|
||||
|
||||
// CRC is OK
|
||||
if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) {
|
||||
|
||||
uint32_t wp = cb_idx[i]*rlen;
|
||||
|
||||
// If it's not the last CB, copy data to another buffer and remove CRC */
|
||||
if (cb_idx[i] < cb_segm->C - 1) {
|
||||
memcpy(&data[wp/8], q->cb_in, rlen/8 * sizeof(uint8_t));
|
||||
// If it's the last CB Append Transport Block parity bits to the last CB
|
||||
} else {
|
||||
memcpy(&data[wp/8], q->cb_in, (rlen - 24)/8 * sizeof(uint8_t));
|
||||
memcpy(parity, &q->cb_in[(rlen - 24)/8], 3 * sizeof(uint8_t));
|
||||
}
|
||||
|
||||
// Reset number of iterations for that CB in the decoder
|
||||
srslte_tdec_reset_cb(&q->decoder, i);
|
||||
remaining_cb--;
|
||||
|
||||
// CRC is error and exceeded maximum iterations for this CB.
|
||||
// Early stop the whole transport block.
|
||||
} else if (srslte_tdec_get_nof_iterations_cb(&q->decoder, i) >= q->max_iterations) {
|
||||
INFO("CB %d: Error. TB is erroneous.\n", cb_idx[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a transport block according to 36.212 5.3.2
|
||||
|
@ -332,10 +448,6 @@ static int decode_tb(srslte_sch_t *q,
|
|||
uint32_t Qm, uint32_t rv, uint32_t nof_e_bits,
|
||||
int16_t *e_bits, uint8_t *data)
|
||||
{
|
||||
uint8_t parity[3] = {0, 0, 0};
|
||||
uint32_t par_rx, par_tx;
|
||||
uint32_t i;
|
||||
uint32_t cb_len, rp, wp, rlen, n_e;
|
||||
|
||||
if (q != NULL &&
|
||||
data != NULL &&
|
||||
|
@ -343,17 +455,11 @@ static int decode_tb(srslte_sch_t *q,
|
|||
e_bits != NULL &&
|
||||
cb_segm != NULL)
|
||||
{
|
||||
|
||||
|
||||
if (cb_segm->tbs == 0 || cb_segm->C == 0) {
|
||||
return SRSLTE_SUCCESS;
|
||||
}
|
||||
|
||||
rp = 0;
|
||||
rp = 0;
|
||||
wp = 0;
|
||||
uint32_t Gp = nof_e_bits / Qm;
|
||||
uint32_t gamma=Gp;
|
||||
|
||||
if (cb_segm->F) {
|
||||
fprintf(stderr, "Error filler bits are not supported. Use standard TBS\n");
|
||||
return SRSLTE_ERROR;
|
||||
|
@ -363,128 +469,41 @@ static int decode_tb(srslte_sch_t *q,
|
|||
fprintf(stderr, "Error number of CB (%d) exceeds soft buffer size (%d CBs)\n", cb_segm->C, softbuffer->max_cb);
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
if (cb_segm->C>0) {
|
||||
gamma = Gp%cb_segm->C;
|
||||
}
|
||||
|
||||
bool early_stop = true;
|
||||
for (i = 0; i < cb_segm->C && early_stop; i++) {
|
||||
|
||||
/* Get read/write lengths */
|
||||
uint32_t cblen_idx;
|
||||
if (i < cb_segm->C2) {
|
||||
cb_len = cb_segm->K2;
|
||||
cblen_idx = cb_segm->K2_idx;
|
||||
} else {
|
||||
cb_len = cb_segm->K1;
|
||||
cblen_idx = cb_segm->K1_idx;
|
||||
}
|
||||
|
||||
if (cb_segm->C == 1) {
|
||||
rlen = cb_len;
|
||||
} else {
|
||||
rlen = cb_len - 24;
|
||||
}
|
||||
|
||||
if (i <= cb_segm->C - gamma - 1) {
|
||||
n_e = Qm * (Gp/cb_segm->C);
|
||||
} else {
|
||||
n_e = Qm * ((uint32_t) ceilf((float) Gp/cb_segm->C));
|
||||
}
|
||||
|
||||
/* Rate Unmatching */
|
||||
if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[i], n_e, cblen_idx, rv)) {
|
||||
fprintf(stderr, "Error in rate matching\n");
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
if (SRSLTE_VERBOSE_ISDEBUG()) {
|
||||
char tmpstr[64];
|
||||
snprintf(tmpstr,64,"rmout_%d.dat",i);
|
||||
DEBUG("SAVED FILE %s: Encoded turbo code block %d\n", tmpstr, i);
|
||||
srslte_vec_save_file(tmpstr, softbuffer->buffer_f[i], (3*cb_len+12)*sizeof(int16_t));
|
||||
}
|
||||
|
||||
/* Turbo Decoding with CRC-based early stopping */
|
||||
q->nof_iterations = 0;
|
||||
uint32_t len_crc;
|
||||
srslte_crc_t *crc_ptr;
|
||||
early_stop = false;
|
||||
|
||||
srslte_tdec_reset(&q->decoder, cb_len);
|
||||
|
||||
do {
|
||||
srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len);
|
||||
q->nof_iterations++;
|
||||
|
||||
if (cb_segm->C > 1) {
|
||||
len_crc = cb_len;
|
||||
crc_ptr = &q->crc_cb;
|
||||
} else {
|
||||
len_crc = cb_segm->tbs+24;
|
||||
crc_ptr = &q->crc_tb;
|
||||
}
|
||||
|
||||
srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len);
|
||||
|
||||
/* Check Codeblock CRC and stop early if correct */
|
||||
if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) {
|
||||
early_stop = true;
|
||||
}
|
||||
|
||||
} while (q->nof_iterations < q->max_iterations && !early_stop);
|
||||
q->average_nof_iterations = SRSLTE_VEC_EMA((float) q->nof_iterations, q->average_nof_iterations, 0.2);
|
||||
|
||||
INFO("CB#%d: cb_len: %d, rlen: %d, wp: %d, rp: %d, E: %d, n_iters=%d\n", i,
|
||||
cb_len, rlen, wp, rp, n_e, q->nof_iterations);
|
||||
|
||||
|
||||
// If CB CRC is not correct, early_stop will be false and wont continue with rest of CBs
|
||||
|
||||
/* Copy data to another buffer, removing the Codeblock CRC */
|
||||
if (i < cb_segm->C - 1) {
|
||||
memcpy(&data[wp/8], q->cb_in, rlen/8 * sizeof(uint8_t));
|
||||
} else {
|
||||
/* Append Transport Block parity bits to the last CB */
|
||||
memcpy(&data[wp/8], q->cb_in, (rlen - 24)/8 * sizeof(uint8_t));
|
||||
memcpy(parity, &q->cb_in[(rlen - 24)/8], 3 * sizeof(uint8_t));
|
||||
}
|
||||
|
||||
if (SRSLTE_VERBOSE_ISDEBUG()) {
|
||||
early_stop = true;
|
||||
}
|
||||
|
||||
/* Set read/write pointers */
|
||||
wp += rlen;
|
||||
rp += n_e;
|
||||
uint8_t parity[3] = {0, 0, 0};
|
||||
bool crc_ok = true;
|
||||
|
||||
uint32_t nof_cb_groups = cb_segm->C2>0?2:1;
|
||||
|
||||
// Process Codeblocks in groups of equal CB size to parallelize according to SRSLTE_TDEC_NPAR
|
||||
for (uint32_t i=0;i<nof_cb_groups && crc_ok;i++) {
|
||||
crc_ok = decode_tb_cb(q, softbuffer, cb_segm, Qm, rv, nof_e_bits, e_bits, data, i, parity);
|
||||
}
|
||||
|
||||
if (!early_stop) {
|
||||
INFO("CB %d failed. TB is erroneous.\n",i-1);
|
||||
return SRSLTE_ERROR;
|
||||
} else {
|
||||
INFO("END CB#%d: wp: %d, rp: %d\n", i, wp, rp);
|
||||
if (crc_ok) {
|
||||
|
||||
uint32_t par_rx = 0, par_tx = 0;
|
||||
|
||||
// Compute transport block CRC
|
||||
par_rx = srslte_crc_checksum_byte(&q->crc_tb, data, cb_segm->tbs);
|
||||
|
||||
// check parity bits
|
||||
par_tx = ((uint32_t) parity[0])<<16 | ((uint32_t) parity[1])<<8 | ((uint32_t) parity[2]);
|
||||
|
||||
|
||||
if (!par_rx) {
|
||||
INFO("Warning: Received all-zero transport block\n\n", 0);
|
||||
INFO("Warning: Received all-zero transport block\n\n",0);
|
||||
}
|
||||
|
||||
if (par_rx == par_tx) {
|
||||
INFO("TB decoded OK\n",i);
|
||||
INFO("TB decoded OK\n",0);
|
||||
return SRSLTE_SUCCESS;
|
||||
} else {
|
||||
INFO("Error in TB parity: par_tx=0x%x, par_rx=0x%x\n", par_tx, par_rx);
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
|
||||
}
|
||||
} else {
|
||||
return SRSLTE_ERROR;
|
||||
}
|
||||
} else {
|
||||
return SRSLTE_ERROR_INVALID_INPUTS;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue