avx turbo decoder working in tests

This commit is contained in:
Ismael Gomez 2017-06-11 16:59:31 -05:00
parent 79e9bccbd8
commit c1ef9da32a
6 changed files with 1124 additions and 41 deletions

View File

@@ -0,0 +1,135 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
/**********************************************************************************************
* File: turbodecoder_simd.h
*
* Description: Turbo Decoder.
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
* encoders and one turbo code internal interleaver. The coding rate of the
* turbo encoder is 1/3.
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
*
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
*********************************************************************************************/
#ifndef TURBODECODER_SIMD_
#define TURBODECODER_SIMD_
#include "srslte/config.h"
#include "srslte/phy/fec/tc_interl.h"
#include "srslte/phy/fec/cbsegm.h"
//#define ENABLE_SIMD_INTER
// The constant SRSLTE_TDEC_NPAR defines the maximum number of code blocks (CBs) decoded in parallel by the SIMD decoders
#ifdef ENABLE_SIMD_INTER
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
#if LV_HAVE_AVX2
#define SRSLTE_TDEC_NPAR_INTRA 2
#else
#define SRSLTE_TDEC_NPAR_INTRA 1
#endif
#else
#if LV_HAVE_AVX2
#define SRSLTE_TDEC_NPAR 2
#else
#define SRSLTE_TDEC_NPAR 1
#endif
#endif
#define SRSLTE_TCOD_RATE 3
#define SRSLTE_TCOD_TOTALTAIL 12
#define SRSLTE_TCOD_MAX_LEN_CB 6144
#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
typedef struct SRSLTE_API {
uint32_t max_long_cb;
uint32_t max_par_cb;
int16_t *alpha;
int16_t *branch;
} map_gen_t;
typedef struct SRSLTE_API {
uint32_t max_long_cb;
uint32_t max_par_cb;
map_gen_t dec;
int16_t *app1[SRSLTE_TDEC_NPAR];
int16_t *app2[SRSLTE_TDEC_NPAR];
int16_t *ext1[SRSLTE_TDEC_NPAR];
int16_t *ext2[SRSLTE_TDEC_NPAR];
int16_t *syst[SRSLTE_TDEC_NPAR];
int16_t *parity0[SRSLTE_TDEC_NPAR];
int16_t *parity1[SRSLTE_TDEC_NPAR];
int cb_mask;
int current_cbidx;
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
int n_iter[SRSLTE_TDEC_NPAR];
} srslte_tdec_simd_t;
SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h,
uint32_t max_par_cb,
uint32_t max_long_cb);
SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h);
SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h,
uint32_t cb_idx);
SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h,
uint32_t cb_idx);
SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h,
int16_t * input[SRSLTE_TDEC_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h,
uint8_t *output[SRSLTE_TDEC_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h,
uint8_t *output[SRSLTE_TDEC_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h,
uint8_t *output,
uint32_t cbidx,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h,
int16_t * input[SRSLTE_TDEC_NPAR],
uint8_t *output[SRSLTE_TDEC_NPAR],
uint32_t nof_iterations,
uint32_t long_cb);
#endif

View File

@@ -46,6 +46,7 @@ uint32_t seed = 0;
int K = -1;
#define MAX_ITERATIONS 10
int nof_cb = 1;
int nof_iterations = MAX_ITERATIONS;
int test_known_data = 0;
int test_errors = 0;
@@ -59,6 +60,7 @@ void usage(char *prog) {
printf("Usage: %s [nlesv]\n", prog);
printf(
"\t-k Test with known data (ignores frame_length) [Default disabled]\n");
printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb);
printf("\t-i nof_iterations [Default %d]\n", nof_iterations);
printf("\t-n nof_frames [Default %d]\n", nof_frames);
printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions);
@@ -70,8 +72,11 @@ void usage(char *prog) {
void parse_args(int argc, char **argv) {
int opt;
while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) {
while ((opt = getopt(argc, argv, "cinNlstvekt")) != -1) {
switch (opt) {
case 'c':
nof_cb = atoi(argv[optind]);
break;
case 'n':
nof_frames = atoi(argv[optind]);
break;
@@ -112,7 +117,7 @@ int main(int argc, char **argv) {
float *llr;
short *llr_s;
uint8_t *llr_c;
uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols;
uint8_t *data_tx, *data_rx, *data_rx_bytes[SRSLTE_TDEC_NPAR], *symbols;
uint32_t i, j;
float var[SNR_POINTS];
uint32_t snr_points;
@@ -154,10 +159,12 @@ int main(int argc, char **argv) {
perror("malloc");
exit(-1);
}
data_rx_bytes = srslte_vec_malloc(frame_length * sizeof(uint8_t));
if (!data_rx_bytes) {
perror("malloc");
exit(-1);
for (int cb=0;cb<SRSLTE_TDEC_NPAR;cb++) {
data_rx_bytes[cb] = srslte_vec_malloc(frame_length * sizeof(uint8_t));
if (!data_rx_bytes[cb]) {
perror("malloc");
exit(-1);
}
}
symbols = srslte_vec_malloc(coded_length * sizeof(uint8_t));
@@ -232,7 +239,6 @@ int main(int argc, char **argv) {
for (j = 0; j < coded_length; j++) {
llr[j] = symbols[j] ? 1 : -1;
}
srslte_ch_awgn_f(llr, llr, var[i], coded_length);
for (j=0;j<coded_length;j++) {
@@ -248,23 +254,38 @@ int main(int argc, char **argv) {
t = nof_iterations;
}
int16_t *input[SRSLTE_TDEC_NPAR];
uint8_t *output[SRSLTE_TDEC_NPAR];
input[0] = llr_s;
if (SRSLTE_TDEC_NPAR == 2)
input[1] = llr_s;
output[0] = data_rx_bytes[0];
if (SRSLTE_TDEC_NPAR == 2)
output[1] = data_rx_bytes[1];
gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_run_all_par(&tdec, input, output, t, nof_cb, frame_length);
}
gettimeofday(&tdata[2], NULL);
get_time_interval(tdata);
mean_usec = (float) mean_usec * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
errors += srslte_bit_diff(data_tx, data_rx, frame_length);
frame_cnt++;
uint32_t errors_this = 0;
for (int cb=0;cb<nof_cb;cb++) {
srslte_bit_unpack_vector(data_rx_bytes[cb], data_rx, frame_length);
errors_this=srslte_bit_diff(data_tx, data_rx, frame_length);
//printf("error[%d]=%d\n", cb, errors_this);
errors += errors_this;
}
printf("Eb/No: %2.2f %10d/%d ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
printf("BER: %.2e ", (float) errors / (frame_cnt * frame_length));
printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec, mean_usec);
printf("\r");
printf("BER: %.2e ", (float) errors / (nof_cb*frame_cnt * frame_length));
printf("%3.1f Mbps (%6.2f usec)", (float) SRSLTE_TDEC_NPAR*frame_length / mean_usec, mean_usec);
printf("\r");
}
printf("\n");
@@ -273,7 +294,7 @@ int main(int argc, char **argv) {
printf("\n");
if (snr_points == 1) {
if (errors) {
printf("%d Errors\n", errors);
printf("%d Errors\n", errors/nof_cb);
}
}

View File

@@ -35,7 +35,7 @@
#ifdef LV_HAVE_SSE
#include "srslte/phy/fec/turbodecoder_sse.h"
#include "srslte/phy/fec/turbodecoder_simd.h"
#endif
#include "srslte/phy/utils/vector.h"
@@ -43,7 +43,7 @@
int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
#ifdef LV_HAVE_SSE
return srslte_tdec_sse_init(&h->tdec_sse, max_long_cb);
return srslte_tdec_simd_init(&h->tdec_simd, max_long_cb);
#else
h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12));
if (!h->input_conv) {
@@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
void srslte_tdec_free(srslte_tdec_t * h) {
#ifdef LV_HAVE_SSE
srslte_tdec_sse_free(&h->tdec_sse);
srslte_tdec_simd_free(&h->tdec_simd);
#else
if (h->input_conv) {
free(h->input_conv);
@@ -68,45 +68,74 @@ void srslte_tdec_free(srslte_tdec_t * h) {
int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
return srslte_tdec_sse_reset(&h->tdec_sse, long_cb);
return srslte_tdec_simd_reset(&h->tdec_simd, long_cb);
#else
return srslte_tdec_gen_reset(&h->tdec_gen, long_cb);
#endif
}
void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
srslte_tdec_sse_iteration(&h->tdec_sse, input, long_cb);
srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb);
#else
srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb);
#endif
}
void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
return srslte_tdec_sse_decision(&h->tdec_sse, output, long_cb);
#else
return srslte_tdec_gen_decision(&h->tdec_gen, output, long_cb);
#endif
void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
int16_t *input_par[SRSLTE_TDEC_NPAR];
input_par[0] = input;
return srslte_tdec_iteration_par(h, input_par, 1, long_cb);
}
void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
return srslte_tdec_simd_decision(&h->tdec_simd, output, nof_cb, long_cb);
#else
return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb);
#endif
}
void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
uint8_t *output_par[SRSLTE_TDEC_NPAR];
output_par[0] = output;
srslte_tdec_decision_par(h, output_par, 1, long_cb);
}
void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb);
#else
srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb);
#endif
}
void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
uint8_t *output_par[SRSLTE_TDEC_NPAR];
output_par[0] = output;
srslte_tdec_decision_byte_par(h, output_par, 1, long_cb);
}
int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR],
uint8_t *output[SRSLTE_TDEC_NPAR],
uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb) {
#ifdef LV_HAVE_SSE
return srslte_tdec_sse_decision_byte(&h->tdec_sse, output, long_cb);
return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb);
#else
return srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb);
srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb);
#endif
}
int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb)
{
#ifdef LV_HAVE_SSE
return srslte_tdec_sse_run_all(&h->tdec_sse, input, output, nof_iterations, long_cb);
#else
srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output, nof_iterations, long_cb);
#endif
uint8_t *output_par[SRSLTE_TDEC_NPAR];
output_par[0] = output;
int16_t *input_par[SRSLTE_TDEC_NPAR];
input_par[0] = input;
return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, 1, long_cb);
}

View File

@@ -0,0 +1,401 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd.h"
#include "srslte/phy/utils/vector.h"
#include <inttypes.h>
#define NUMSTATES 8
#define NINPUTS 2
#define TAIL 3
#define TOTALTAIL 12
#define INF 10000
#define ZERO 0
#ifdef LV_HAVE_AVX2
#include <smmintrin.h>
#include <immintrin.h>
// Number of code blocks processed in parallel by the AVX implementation
#define NCB 2
/*
static void print_256i(__m256i x) {
int16_t *s = (int16_t*) &x;
printf("[%d", s[0]);
for (int i=1;i<16;i++) {
printf(",%d", s[i]);
}
printf("]\n");
}
*/
/* Computes the horizontal maximum of the lower 128-bit lane (8 16-bit values) using the
* SSE4.1 minpos_epu16 instruction. Callers pass values inverted as 0x7FFF - x, so the
* minimum found here corresponds to the maximum of the original metrics */
static inline int16_t hMax0(__m256i masked_value)
{
__m128i tmp1 = _mm256_extractf128_si256(masked_value, 0);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
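/* Same as hMax0, but operating on the upper 128-bit lane */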
static inline int16_t hMax1(__m256i masked_value)
{
__m128i tmp1 = _mm256_extractf128_si256(masked_value, 1);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
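/* Minimal scalar sketch of what the two helpers above compute (illustrative only):
* SSE4.1 offers a horizontal unsigned minimum (_mm_minpos_epu16) but no horizontal maximum,
* so the callers first invert every lane as 0x7FFF - x; the minimum of the inverted lanes
* equals 0x7FFF minus the maximum of the originals, and the constant cancels when two such
* results are subtracted from each other. */
static inline int16_t hmax_scalar_sketch(const int16_t *x, int n)
{
  int16_t m = x[0];
  for (int i = 1; i < n; i++) {
    if (x[i] > m) {
      m = x[i];
    }
  }
  return m;
}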
/* Computes beta values */
void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb)
{
int k;
uint32_t end = long_cb + 3;
const __m256i *alphaPtr = (const __m256i*) s->alpha;
__m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
__m256i g, bp, bn, alpha_k;
/* Define the shuffle constant for the positive beta */
__m256i shuf_bp = _mm256_set_epi8(
// 1st CB
15+16, 14+16, // 7
7+16, 6+16, // 3
5+16, 4+16, // 2
13+16, 12+16, // 6
11+16, 10+16, // 5
3+16, 2+16, // 1
1+16, 0+16, // 0
9+16, 8+16, // 4
// 2nd CB
15, 14, // 7
7, 6, // 3
5, 4, // 2
13, 12, // 6
11, 10, // 5
3, 2, // 1
1, 0, // 0
9, 8 // 4
);
/* Define the shuffle constant for the negative beta */
__m256i shuf_bn = _mm256_set_epi8(
7+16, 6+16, // 3
15+16, 14+16, // 7
13+16, 12+16, // 6
5+16, 4+16, // 2
3+16, 2+16, // 1
11+16, 10+16, // 5
9+16, 8+16, // 4
1+16, 0+16, // 0
7, 6, // 3
15, 14, // 7
13, 12, // 6
5, 4, // 2
3, 2, // 1
11, 10, // 5
9, 8, // 4
1, 0 // 0
);
alphaPtr += long_cb-1;
/* Define shuffle for branch costs */
__m256i shuf_g[4];
shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,
3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,
7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,
11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,
15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
/* Define shuffle for beta normalization */
__m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
__m256i gv;
int16_t *b = &s->branch[2*NCB*long_cb-16];
__m256i *gPtr = (__m256i*) b;
__m256i bn2, bp2;
/* This defines a beta computation step:
* Adds and subtracts the branch metrics to/from the previous beta values,
* shuffles the states according to the trellis paths and selects the maximum for each state
*/
#define BETA_STEP(g) bp = _mm256_add_epi16(beta_k, g);\
bn = _mm256_sub_epi16(beta_k, g);\
bp = _mm256_shuffle_epi8(bp, shuf_bp);\
bn = _mm256_shuffle_epi8(bn, shuf_bn);\
beta_k = _mm256_max_epi16(bp, bn);
/* Loads the alpha metrics from memory and adds them to the temporary bn and bp
* metrics, then computes the horizontal maximum of both and takes their difference
*/
#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\
BETA_STEP(g)\
alpha_k = _mm256_load_si256(alphaPtr);\
alphaPtr--;\
bp = _mm256_add_epi16(bp, alpha_k);\
bn = _mm256_add_epi16(bn, alpha_k);\
bn2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bn);\
bp2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bp);\
output[0][k-d] = hMax0(bn2) - hMax0(bp2);\
output[1][k-d] = hMax1(bn2) - hMax1(bp2);
/* The tail does not require loading alpha or producing outputs; only the
* beta metrics are updated */
for (k=end-1; k>=long_cb; k--) {
int16_t g0_1 = s->branch[2*NCB*k];
int16_t g1_1 = s->branch[2*NCB*k+1];
int16_t g0_2 = s->branch[2*NCB*k+6];
int16_t g1_2 = s->branch[2*NCB*k+6+1];
g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1);
BETA_STEP(g);
}
/* Each loop iteration inlines two branch-metric loads of 4 trellis steps each, normalizing beta after every 4 steps */
__m256i norm;
for (; k >= 0; k-=8) {
gv = _mm256_load_si256(gPtr);
gPtr--;
BETA_STEP_CNT(0,0);
BETA_STEP_CNT(1,1);
BETA_STEP_CNT(2,2);
BETA_STEP_CNT(3,3);
norm = _mm256_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm256_sub_epi16(beta_k, norm);
gv = _mm256_load_si256(gPtr);
gPtr--;
BETA_STEP_CNT(0,4);
BETA_STEP_CNT(1,5);
BETA_STEP_CNT(2,6);
BETA_STEP_CNT(3,7);
norm = _mm256_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm256_sub_epi16(beta_k, norm);
}
}
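/* Scalar sketch of the max-log-MAP backward recursion that BETA_STEP vectorizes
* (illustrative only): next_state[s][u] and gamma[k][s][u] are assumptions standing in
* for the trellis of the LTE constituent encoder and its branch metrics, which the
* shuffle constants above encode implicitly. Only two metrics (g0, g1) are stored per
* step because the four possible branch metrics are +-g0 and +-g1. */
static void beta_recursion_sketch(int16_t beta[][8], const int16_t gamma[][8][2],
                                  const uint8_t next_state[8][2], uint32_t long_cb)
{
  for (int k = (int) long_cb - 1; k >= 0; k--) {
    for (int s = 0; s < 8; s++) {
      int16_t b0 = beta[k + 1][next_state[s][0]] + gamma[k][s][0]; /* branch for input bit 0 */
      int16_t b1 = beta[k + 1][next_state[s][1]] + gamma[k][s][1]; /* branch for input bit 1 */
      beta[k][s] = (b0 > b1) ? b0 : b1;                            /* max-log approximation */
    }
  }
}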
/* Computes alpha metrics */
void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
{
uint32_t k;
int16_t *alpha1 = s->alpha;
int16_t *alpha2 = &s->alpha[8];
uint32_t i;
alpha1[0] = 0;
alpha2[0] = 0;
for (i = 1; i < 8; i++) {
alpha1[i] = -INF;
alpha2[i] = -INF;
}
/* Define the shuffle constant for the positive alpha */
__m256i shuf_ap = _mm256_set_epi8(
// 1st CB
31, 30, // 7
25, 24, // 4
23, 22, // 3
17, 16, // 0
29, 28, // 6
27, 26, // 5
21, 20, // 2
19, 18, // 1
// 2nd CB
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0, // 0
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2 // 1
);
/* Define the shuffle constant for the negative alpha */
__m256i shuf_an = _mm256_set_epi8(
// 1st CB
29, 28, // 6
27, 26, // 5
21, 20, // 2
19, 18, // 1
31, 30, // 7
25, 24, // 4
23, 22, // 3
17, 16, // 0
// 2nd CB
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2, // 1
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0 // 0
);
/* Define shuffle for branch costs */
__m256i shuf_g[4];
shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,
3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,
7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,
11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,
15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
__m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
__m256i* alphaPtr = (__m256i*) s->alpha;
alphaPtr++;
__m256i gv;
__m256i *gPtr = (__m256i*) s->branch;
__m256i g, ap, an;
__m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
/* This defines an alpha computation step:
* Adds and subtracts the branch metrics to/from the previous alpha values,
* shuffles the states according to the trellis paths and selects the maximum for each state
*/
#define ALPHA_STEP(c) g = _mm256_shuffle_epi8(gv, shuf_g[c]); \
ap = _mm256_add_epi16(alpha_k, g);\
an = _mm256_sub_epi16(alpha_k, g);\
ap = _mm256_shuffle_epi8(ap, shuf_ap);\
an = _mm256_shuffle_epi8(an, shuf_an);\
alpha_k = _mm256_max_epi16(ap, an);\
_mm256_store_si256(alphaPtr, alpha_k);\
alphaPtr++; \
/* Each loop iteration performs two branch-metric loads of 4 trellis steps each, normalizing alpha after every 4 steps */
__m256i norm;
for (k = 0; k < long_cb/8; k++) {
gv = _mm256_load_si256(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm256_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm256_sub_epi16(alpha_k, norm);
gv = _mm256_load_si256(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm256_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm256_sub_epi16(alpha_k, norm);
}
}
/* Compute branch metrics (gamma) */
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb)
{
__m128i res10, res20, res11, res21, res1, res2;
__m128i in, ap, pa, g1, g0;
__m128i *inPtr = (__m128i*) input;
__m128i *appPtr = (__m128i*) app;
__m128i *paPtr = (__m128i*) parity;
__m128i *resPtr = (__m128i*) h->branch;
if (cbidx) {
resPtr++;
}
__m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
__m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
__m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
__m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
for (int i=0;i<long_cb/8;i++) {
in = _mm_load_si128(inPtr);
inPtr++;
pa = _mm_load_si128(paPtr);
paPtr++;
if (appPtr) {
ap = _mm_load_si128(appPtr);
appPtr++;
in = _mm_add_epi16(ap, in);
}
g1 = _mm_add_epi16(in, pa);
g0 = _mm_sub_epi16(in, pa);
g1 = _mm_srai_epi16(g1, 1);
g0 = _mm_srai_epi16(g0, 1);
res10 = _mm_shuffle_epi8(g0, res10_mask);
res20 = _mm_shuffle_epi8(g0, res20_mask);
res11 = _mm_shuffle_epi8(g1, res11_mask);
res21 = _mm_shuffle_epi8(g1, res21_mask);
res1 = _mm_or_si128(res10, res11);
res2 = _mm_or_si128(res20, res21);
_mm_store_si128(resPtr, res1);
resPtr++;
resPtr++;
_mm_store_si128(resPtr, res2);
resPtr++;
resPtr++;
}
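/* Compute the branch metrics for the 3 tail steps with scalar code */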
for (int i=long_cb;i<long_cb+3;i++) {
h->branch[2*i*NCB+cbidx*6] = (input[i] - parity[i])/2;
h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2;
}
}
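/* Scalar reference of the branch-metric computation above (illustrative sketch): only two
* metrics per trellis step are stored, g0 = (s + a - p)/2 and g1 = (s + a + p)/2, where s
* is the systematic LLR, a the a-priori LLR (NULL when there is no a-priori information)
* and p the parity LLR. The remaining branch metrics are obtained as +-g0 / +-g1 by the
* alpha and beta kernels. */
static void gamma_scalar_sketch(const int16_t *input, const int16_t *app,
                                const int16_t *parity, int16_t *g0, int16_t *g1,
                                uint32_t long_cb)
{
  for (uint32_t i = 0; i < long_cb; i++) {
    int16_t in = input[i] + (app ? app[i] : 0);
    g0[i] = (in - parity[i]) / 2;
    g1[i] = (in + parity[i]) / 2;
  }
}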
#endif

View File

@@ -0,0 +1,486 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd.h"
#include "srslte/phy/utils/vector.h"
#include <inttypes.h>
#define NUMSTATES 8
#define NINPUTS 2
#define TAIL 3
#define TOTALTAIL 12
#define INF 10000
#define ZERO 0
#ifdef LV_HAVE_SSE
#include <smmintrin.h>
// Define SSE/AVX implementations
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb);
void map_sse_alpha(map_gen_t * s, uint32_t long_cb);
void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb);
#ifdef LV_HAVE_AVX2
void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb);
void map_avx_alpha(map_gen_t * s, uint32_t long_cb);
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb);
#endif
void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_beta(s, output[0], long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_beta(s, output, long_cb);
}
#endif
}
void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_alpha(s, long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_alpha(s, long_cb);
}
#endif
}
void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_gamma(s, input, app, parity, long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_gamma(s, input, app, parity, cbidx, long_cb);
}
#endif
}
/* Initializes the constituent decoder object */
int map_simd_init(map_gen_t * h, int max_long_cb)
{
bzero(h, sizeof(map_gen_t));
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
if (!h->alpha) {
perror("srslte_vec_malloc");
return -1;
}
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
if (!h->branch) {
perror("srslte_vec_malloc");
return -1;
}
h->max_long_cb = max_long_cb;
return 0;
}
void map_simd_free(map_gen_t * h)
{
if (h->alpha) {
free(h->alpha);
}
if (h->branch) {
free(h->branch);
}
bzero(h, sizeof(map_gen_t));
}
/* Runs one pass of the constituent MAP decoder (branch metrics, forward and backward recursions) */
void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app[SRSLTE_TDEC_NPAR], int16_t * parity[SRSLTE_TDEC_NPAR],
int16_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
// Compute branch metrics
for (int i=0;i<nof_cb;i++) {
map_simd_gamma(h, input[i], app?app[i]:NULL, parity[i], i, nof_cb, long_cb);
}
// Forward recursion
map_simd_alpha(h, nof_cb, long_cb);
// Backwards recursion + LLR computation
map_simd_beta(h, output, nof_cb, long_cb);
}
/* Initializes the turbo decoder object */
int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb)
{
int ret = -1;
bzero(h, sizeof(srslte_tdec_simd_t));
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
h->max_long_cb = max_long_cb;
for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app2[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext2[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->syst[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity0[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
}
if (map_simd_init(&h->dec, h->max_long_cb)) {
goto clean_and_exit;
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
}
h->current_cbidx = -1;
ret = 0;
clean_and_exit:if (ret == -1) {
srslte_tdec_simd_free(h);
}
return ret;
}
void srslte_tdec_simd_free(srslte_tdec_simd_t * h)
{
for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
if (h->app1[i]) {
free(h->app1[i]);
}
if (h->app2[i]) {
free(h->app2[i]);
}
if (h->ext1[i]) {
free(h->ext1[i]);
}
if (h->ext2[i]) {
free(h->ext2[i]);
}
if (h->syst[i]) {
free(h->syst[i]);
}
if (h->parity0[i]) {
free(h->parity0[i]);
}
if (h->parity1[i]) {
free(h->parity1[i]);
}
}
map_simd_free(&h->dec);
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
srslte_tc_interl_free(&h->interleaver[i]);
}
bzero(h, sizeof(srslte_tdec_simd_t));
}
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
* 3 buffers ready to be used by the branch metric computation (map_simd_gamma())
*/
void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) {
uint32_t i;
__m128i *inputPtr = (__m128i*) input;
__m128i in0, in1, in2;
__m128i s0, s1, s2, s;
__m128i p00, p01, p02, p0;
__m128i p10, p11, p12, p1;
__m128i *sysPtr = (__m128i*) h->syst[cbidx];
__m128i *pa0Ptr = (__m128i*) h->parity0[cbidx];
__m128i *pa1Ptr = (__m128i*) h->parity1[cbidx];
// pick bits 0, 3, 6 from 1st word
__m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
// pick bits 1, 4, 7 from 2nd word
__m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 3rd word
__m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 1st word
__m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
// pick bits 2, 5 from 2nd word
__m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 0, 3, 6 from 3rd word
__m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 1st word
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
// pick bits 0, 3, 6 from 2nd word
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 3rd word
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// Split systematic and parity bits
for (i = 0; i < long_cb/8; i++) {
in0 = _mm_load_si128(inputPtr); inputPtr++;
in1 = _mm_load_si128(inputPtr); inputPtr++;
in2 = _mm_load_si128(inputPtr); inputPtr++;
/* Deinterleave Systematic bits */
s0 = _mm_shuffle_epi8(in0, s0_mask);
s1 = _mm_shuffle_epi8(in1, s1_mask);
s2 = _mm_shuffle_epi8(in2, s2_mask);
s = _mm_or_si128(s0, s1);
s = _mm_or_si128(s, s2);
_mm_store_si128(sysPtr, s);
sysPtr++;
/* Deinterleave parity 0 bits */
p00 = _mm_shuffle_epi8(in0, p00_mask);
p01 = _mm_shuffle_epi8(in1, p01_mask);
p02 = _mm_shuffle_epi8(in2, p02_mask);
p0 = _mm_or_si128(p00, p01);
p0 = _mm_or_si128(p0, p02);
_mm_store_si128(pa0Ptr, p0);
pa0Ptr++;
/* Deinterleave parity 1 bits */
p10 = _mm_shuffle_epi8(in0, p10_mask);
p11 = _mm_shuffle_epi8(in1, p11_mask);
p12 = _mm_shuffle_epi8(in2, p12_mask);
p1 = _mm_or_si128(p10, p11);
p1 = _mm_or_si128(p1, p12);
_mm_store_si128(pa1Ptr, p1);
pa1Ptr++;
}
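/* Tail bits for the first constituent decoder */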
for (i = 0; i < 3; i++) {
h->syst[cbidx][i+long_cb] = input[3*long_cb + 2*i];
h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1];
}
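/* Tail bits for the second constituent decoder (its systematic tail goes into app2) */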
for (i = 0; i < 3; i++) {
h->app2[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i];
h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
}
}
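/* Scalar reference of the deinterleaving above (illustrative sketch): the input buffer
* holds the three turbo-encoder streams interleaved as (d^0, d^1, d^2) triplets, followed
* by the 2 x 6 tail values handled in the loops above. */
static void deinterleave_scalar_sketch(const int16_t *input, int16_t *syst,
                                       int16_t *parity0, int16_t *parity1, uint32_t long_cb)
{
  for (uint32_t i = 0; i < long_cb; i++) {
    syst[i]    = input[3 * i];     /* systematic stream d^0 */
    parity0[i] = input[3 * i + 1]; /* first parity stream d^1 */
    parity1[i] = input[3 * i + 2]; /* second parity stream d^2 */
  }
}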
/* Runs 1 turbo decoder iteration */
void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
if (h->current_cbidx >= 0) {
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
if (h->n_iter == 0) {
for (int i=0;i<nof_cb;i++) {
deinterleave_input_simd(h, input[i], i, long_cb);
}
}
// Add a-priori information to decoder 1
if (h->n_iter > 0) {
for (int i=0;i<nof_cb;i++) {
srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb);
}
}
// Run MAP DEC #1
if (h->n_iter == 0) {
map_simd_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, nof_cb, long_cb);
} else {
map_simd_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, nof_cb, long_cb);
}
// Convert a-posteriori information into extrinsic information
if (h->n_iter > 0) {
for (int i=0;i<nof_cb;i++) {
srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb);
}
}
// Interleave the extrinsic output of DEC1 to form a-priori info for decoder 2
for (int i=0;i<nof_cb;i++) {
srslte_vec_lut_sss(h->ext1[i], deinter, h->app2[i], long_cb);
}
// Run MAP DEC #2. The 2nd decoder uses the a-priori information as systematic bits
map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, nof_cb, long_cb);
// Deinterleaved extrinsic bits become a-priori info for decoder 1
for (int i=0;i<nof_cb;i++) {
srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb);
}
h->n_iter++;
} else {
fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n");
}
}
/* Resets the decoder and sets the codeblock length */
int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
{
if (long_cb > h->max_long_cb) {
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
h->max_long_cb);
return -1;
}
h->n_iter = 0;
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
if (h->current_cbidx < 0) {
fprintf(stderr, "Invalid CB length %d\n", long_cb);
return -1;
}
return 0;
}
void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
{
__m128i zero = _mm_set1_epi16(0);
__m128i lsb_mask = _mm_set1_epi16(1);
__m128i *appPtr = (__m128i*) h->app1[cbidx];
__m128i *outPtr = (__m128i*) output;
__m128i ap, out, out0, out1;
for (uint32_t i = 0; i < long_cb/16; i++) {
ap = _mm_load_si128(appPtr); appPtr++;
out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
ap = _mm_load_si128(appPtr); appPtr++;
out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
out = _mm_packs_epi16(out0, out1);
_mm_store_si128(outPtr, out);
outPtr++;
}
if (long_cb%16) {
for (int i=0;i<8;i++) {
output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0;
}
}
}
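/* Scalar reference of the hard decision above (illustrative sketch): one 0/1 value per
* a-posteriori LLR, with positive LLRs mapping to bit 1. */
static void decision_scalar_sketch(const int16_t *app, uint8_t *output, uint32_t long_cb)
{
  for (uint32_t i = 0; i < long_cb; i++) {
    output[i] = (app[i] > 0) ? 1 : 0;
  }
}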
void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
for (int i=0;i<nof_cb;i++) {
tdec_simd_decision(h, output[i], i, long_cb);
}
}
void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
{
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
// long_cb is always a multiple of 8, so the output is produced in whole bytes
for (uint32_t i = 0; i < long_cb/8; i++) {
uint8_t out0 = h->app1[cbidx][8*i+0]>0?mask[0]:0;
uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0;
uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0;
uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0;
uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0;
uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0;
uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0;
uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
//if (i<10) {
// printf("output[%d]=%d\n",i,output[i]);
//}
}
}
void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
for (int i=0;i<nof_cb;i++) {
tdec_simd_decision_byte(h, output[i], i, long_cb);
}
}
/* Runs nof_iterations iterations and decides the output bits */
int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
{
if (srslte_tdec_simd_reset(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
srslte_tdec_simd_iteration(h, input, nof_cb, long_cb);
} while (h->n_iter < nof_iterations);
srslte_tdec_simd_decision_byte(h, output, nof_cb, long_cb);
return SRSLTE_SUCCESS;
}
#endif

View File

@@ -52,6 +52,17 @@
#ifdef LV_HAVE_SSE
/*
static void print_128i(__m128i x) {
int16_t *s = (int16_t*) &x;
printf("[%d", s[0]);
for (int i=1;i<8;i++) {
printf(",%d", s[i]);
}
printf("]\n");
}
*/
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
static inline int16_t hMax(__m128i buffer)
{
@@ -126,7 +137,8 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
alpha_k = _mm_load_si128(alphaPtr);\
alphaPtr--;\
bp = _mm_add_epi16(bp, alpha_k);\
bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp);
bn = _mm_add_epi16(bn, alpha_k);\
output[k-d] = hMax(bn) - hMax(bp);
/* The tail does not require loading alpha or producing outputs; only the
* beta metrics are updated */
@@ -134,7 +146,6 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
int16_t g0 = s->branch[2*k];
int16_t g1 = s->branch[2*k+1];
g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
BETA_STEP(g);
}