srsLTE/lib/include/srslte/phy/fec/turbodecoder_win.h

750 lines
20 KiB
C
Raw Normal View History

/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include "srslte/config.h"
#define MAKE_FUNC(a) CONCAT2(CONCAT2(tdec_win,WINIMP),CONCAT2(_,a))
#define MAKE_TYPE CONCAT2(CONCAT2(tdec_win_,WINIMP),_t)
#ifdef WINIMP_IS_SSE16
#ifndef LV_HAVE_SSE
#error "Selected SSE window decoder but instruction set not supported"
#endif
#include <nmmintrin.h>
#define WINIMP sse16
#define nof_blocks 8
#define llr_t int16_t
#define simd_type_t __m128i
#define simd_load _mm_load_si128
#define simd_store _mm_store_si128
#define simd_add _mm_adds_epi16
#define simd_sub _mm_subs_epi16
#define simd_max _mm_max_epi16
#define simd_set1 _mm_set1_epi16
#define simd_insert _mm_insert_epi16
#define simd_shuffle _mm_shuffle_epi8
#define move_right _mm_set_epi8(15,14,15,14,13,12,11,10,9,8,7,6,5,4,3,2)
#define move_left _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0)
#define simd_rb_shift _mm_srai_epi16
#define normalize_period 2
#define win_overlap_len 40
#define divide_output 1
#define INF 10000
#else
#ifdef WINIMP_IS_AVX16
#ifndef LV_HAVE_AVX
#error "Selected AVX window decoder but instruction set not supported"
#endif
#include <immintrin.h>
#define WINIMP avx16
#define nof_blocks 16
#define llr_t int16_t
#define simd_type_t __m256i
#define simd_load _mm256_load_si256
#define simd_store _mm256_store_si256
#define simd_add _mm256_adds_epi16
#define simd_sub _mm256_subs_epi16
#define simd_max _mm256_max_epi16
#define simd_set1 _mm256_set1_epi16
#define simd_insert _mm256_insert_epi16
#define simd_shuffle _mm256_shuffle_epi8
#define move_right _mm256_set_epi8(31,30,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)
#define move_left _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0)
#define normalize_period 2
#define win_overlap_len 40
#define INF 10000
#else
#ifdef WINIMP_IS_SSE8
#ifndef LV_HAVE_SSE
#error "Selected SSE window decoder but instruction set not supported"
#endif
#include <nmmintrin.h>
#define WINIMP sse8
#define nof_blocks 16
#define llr_t int8_t
#define simd_type_t __m128i
#define simd_load _mm_load_si128
#define simd_store _mm_store_si128
#define simd_add _mm_adds_epi8
#define simd_sub _mm_subs_epi8
#define simd_max _mm_max_epi8
#define simd_set1 _mm_set1_epi8
#define simd_insert _mm_insert_epi8
#define simd_shuffle _mm_shuffle_epi8
#define move_right _mm_set_epi8(15,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)
#define move_left _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0)
#define simd_rb_shift simd_rb_shift_128
#define normalize_max
#define normalize_period 1
#define win_overlap_len 40
#define use_saturated_add
#define divide_output 1
#define INF 0
inline static simd_type_t simd_rb_shift_128(simd_type_t v, const int l) {
__m128i low = _mm_srai_epi16(_mm_slli_epi16(v,8), l+8);
__m128i hi = _mm_srai_epi16(v,l);
return _mm_blendv_epi8(hi, low, _mm_set1_epi32(0x00FF00FF));
}
#else
#ifdef WINIMP_IS_AVX8
#ifndef LV_HAVE_AVX
#error "Selected AVX window decoder but instruction set not supported"
#endif
#include <immintrin.h>
#define WINIMP avx8
#define nof_blocks 32
#define llr_t int8_t
#define simd_type_t __m256i
#define simd_load _mm256_load_si256
#define simd_store _mm256_store_si256
#define simd_add _mm256_adds_epi8
#define simd_sub _mm256_subs_epi8
#define simd_max _mm256_max_epi8
#define simd_set1 _mm256_set1_epi8
#define simd_insert _mm256_insert_epi8
#define simd_shuffle _mm256_shuffle_epi8
#define move_right _mm256_set_epi8(31,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)
#define move_left _mm256_set_epi8(30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0)
#define simd_rb_shift simd_rb_shift_256
#define INF 0
#define normalize_max
#define normalize_period 1
#define win_overlap_len 40
#define use_saturated_add
#define divide_output 1
inline static simd_type_t simd_rb_shift_256(simd_type_t v, const int l) {
__m256i low = _mm256_srai_epi16(_mm256_slli_epi16(v,8), l+8);
__m256i hi = _mm256_srai_epi16(v,l);
return _mm256_blendv_epi8(hi, low, _mm256_set1_epi32(0x00FF00FF));
}
#else
#error "Unknown WINIMP value"
#endif
#endif
#endif
#endif
typedef struct SRSLTE_API {
uint32_t max_long_cb;
llr_t *beta;
} MAKE_TYPE;
#define long_sb (long_cb/nof_blocks)
#define debug_enabled_win 0
#if debug_enabled_win
#define debug_state(d) printf("k=%5d, in=%5d, pa=%3d, out=%5d, alpha=[", d*long_sb+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d), MAKE_FUNC(get_simd)(out,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \
printf("], beta=["); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(beta_save[j], d));printf("\n");
#define debug_state_pre(d) printf("pre-window k=%5d, in=%5d, pa=%3d, alpha=[", (d+1)*long_sb-loop_len+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \
printf("]\n");
#define debug_state_beta(d) printf("k=%5d, in=%5d, pa=%3d, beta=[", d*long_sb+k, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d));\
printf("\n");
static llr_t MAKE_FUNC(get_simd)(simd_type_t x, uint32_t pos) {
llr_t *s = (llr_t*) &x;
return s[pos];
}
#else
#define debug_state(a)
#define debug_state_pre(a)
#define debug_state_beta(a)
#endif
/*
static void MAKE_FUNC(print_simd)(simd_type_t x) {
llr_t *s = (llr_t*) &x;
printf("[");
for (int i=0;i<nof_blocks;i++) {
printf("%4d, ", s[i]);
}
printf("]\n");
}*/
inline static llr_t MAKE_FUNC(sadd)(llr_t x, llr_t y) {
#ifndef use_saturated_add
return x+y;
#else
int16_t z = (int16_t) x+y;
return z>127?127:(int8_t) z;
#endif
}
inline static void MAKE_FUNC(normalize)(uint32_t k, simd_type_t old[8]) {
if ((k % normalize_period) == 0 && k != 0) {
#ifdef normalize_max
simd_type_t m = simd_max(old[0],old[1]);
for (int i=2;i<8;i++) {
m = simd_max(m,old[i]);
}
for (int i=0;i<8;i++) {
old[i] = simd_sub(old[i], m);
}
#else
for (int i = 1; i < 8; i++) {
old[i] = simd_sub(old[i], old[0]);
}
old[0] = simd_set1(0);
#endif
}
}
static void MAKE_FUNC(beta_trellis)(llr_t *input, llr_t *parity, uint32_t long_cb, llr_t old[8])
{
llr_t m_b[8], new[8];
llr_t x, y, xy;
/* Calculate last state using Tail. No need to use SIMD here */
old[0] = 0;
for (int i = 1; i < 8; i++) {
old[i] = -INF;
}
for (int k=long_cb+2;k >= long_cb; k--) {
x = input[k];
y = parity[k];
xy = MAKE_FUNC(sadd)(x, y);
m_b[0] = MAKE_FUNC(sadd)(old[4],xy);
m_b[1] = old[4];
m_b[2] = MAKE_FUNC(sadd)(old[5], y);
m_b[3] = MAKE_FUNC(sadd)(old[5], x);
m_b[4] = MAKE_FUNC(sadd)(old[6], x);
m_b[5] = MAKE_FUNC(sadd)(old[6], y);
m_b[6] = old[7];
m_b[7] = MAKE_FUNC(sadd)(old[7], xy);
new[0] = old[0];
new[1] = MAKE_FUNC(sadd)(old[0], xy);
new[2] = MAKE_FUNC(sadd)(old[1], x);
new[3] = MAKE_FUNC(sadd)(old[1], y);
new[4] = MAKE_FUNC(sadd)(old[2], y);
new[5] = MAKE_FUNC(sadd)(old[2], x);
new[6] = MAKE_FUNC(sadd)(old[3], xy);
new[7] = old[3];
#if debug_enabled_win
printf("trellis: k=%d, in=%d, pa=%d, beta: ", k, x, y); for (int i=0;i<8;i++) {printf("%d,", old[i]);} printf("\n");
#endif
for (int i = 0; i < 8; i++) {
if (m_b[i] > new[i])
new[i] = m_b[i];
old[i] = new[i];
}
}
}
/* Computes beta values */
static void MAKE_FUNC(beta)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, uint32_t long_cb)
{
simd_type_t m_b[8], new[8], old[8];
simd_type_t x, y, xy, ap;
simd_type_t *inputPtr;
simd_type_t *appPtr;
simd_type_t *parityPtr;
simd_type_t *betaPtr = (simd_type_t*) s->beta;
uint32_t loop_len;
for (int j=0;j<2;j++) {
// First run L states to find initial state for all sub-blocks after first
if (j==0) {
loop_len = win_overlap_len;
} else {
loop_len = long_sb;
}
// When passing through all window pick estimated initial states (known state for sb=0)
if (loop_len == long_sb) {
// shuffle across 128-bit boundary manually
#ifdef WINIMP_IS_AVX16
llr_t tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = _mm256_extract_epi16(old[i], 8);
}
#endif
#ifdef WINIMP_IS_AVX8
llr_t tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = _mm256_extract_epi8(old[i], 16);
}
#endif
for (int i = 0; i < 8; i++) {
old[i] = simd_shuffle(old[i], move_right);
}
// last sub-block state is calculated from the trellis
llr_t trellis_old[8];
MAKE_FUNC(beta_trellis)(input, parity, long_cb, trellis_old);
for (int i = 0; i < 8; i++) {
old[i] = simd_insert(old[i], trellis_old[i], nof_blocks-1);
}
#ifdef WINIMP_IS_AVX16
for (int i = 0; i < 8; i++) {
old[i] = _mm256_insert_epi16(old[i], tmp[i], 7);
}
#endif
#ifdef WINIMP_IS_AVX8
for (int i = 0; i < 8; i++) {
old[i] = _mm256_insert_epi8(old[i], tmp[i], 15);
}
#endif
inputPtr = (simd_type_t*) &input[long_cb-nof_blocks];
appPtr = (simd_type_t*) &app[long_cb-nof_blocks];
parityPtr = (simd_type_t*) &parity[long_cb-nof_blocks];
for (int i = 0; i < 8; i++) {
simd_store(&betaPtr[8*long_sb + i], old[i]);
}
} else {
// when estimating states, just set all to unknown
for (int i = 0; i < 8; i++) {
old[i] = simd_set1(-INF);
}
inputPtr = (simd_type_t*) &input[nof_blocks*(loop_len-1)];
appPtr = (simd_type_t*) &app[nof_blocks*(loop_len-1)];
parityPtr = (simd_type_t*) &parity[nof_blocks*(loop_len-1)];
}
for (int k = loop_len - 1; k >= 0; k--) {
x = simd_load(inputPtr--);
y = simd_load(parityPtr--);
if (app) {
ap = simd_load(appPtr--);
x = simd_add(ap, x);
}
xy = simd_add(x, y);
m_b[0] = simd_add(old[4], xy);
m_b[1] = old[4];
m_b[2] = simd_add(old[5], y);
m_b[3] = simd_add(old[5], x);
m_b[4] = simd_add(old[6], x);
m_b[5] = simd_add(old[6], y);
m_b[6] = old[7];
m_b[7] = simd_add(old[7], xy);
new[0] = old[0];
new[1] = simd_add(old[0], xy);
new[2] = simd_add(old[1], x);
new[3] = simd_add(old[1], y);
new[4] = simd_add(old[2], y);
new[5] = simd_add(old[2], x);
new[6] = simd_add(old[3], xy);
new[7] = old[3];
// Calculate maximum metric
for (int i = 0; i < 8; i++) {
old[i] = simd_max(m_b[i], new[i]);
}
// Store metric only when doing the final pass
if (loop_len == long_sb) {
for (int i = 0; i < 8; i++) {
simd_store(&betaPtr[8*k + i], old[i]);
}
}
if (loop_len!=long_sb) {
debug_state_beta(0);
} else {
debug_state_beta(0);
}
// normalize
MAKE_FUNC(normalize)(k, old);
}
}
}
/* Computes alpha metrics */
static void MAKE_FUNC(alpha)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, llr_t * output, uint32_t long_cb)
{
simd_type_t m_b[8], new[8], old[8], max1[8], max0[8];
simd_type_t x, y, xy, ap;
simd_type_t m1, m0;
simd_type_t *inputPtr;
simd_type_t *appPtr;
simd_type_t *parityPtr;
simd_type_t *betaPtr = (simd_type_t*) s->beta;
simd_type_t *outputPtr = (simd_type_t*) output;
#if debug_enabled_win
simd_type_t beta_save[8];
#endif
// Skip state 0
betaPtr+=8;
uint32_t loop_len;
for (int j=0;j<2;j++) {
// First run L states to find initial state for all sub-blocks after first
if (j==0) {
loop_len = win_overlap_len;
} else {
loop_len = long_sb;
}
// When passing through all window pick estimated initial states (known state for sb=0)
if (loop_len == long_sb) {
#ifdef WINIMP_IS_AVX16
llr_t tmp[8];
for (int i=0;i<8;i++) {
tmp[i] = _mm256_extract_epi16(old[i], 7);
}
#endif
#ifdef WINIMP_IS_AVX8
llr_t tmp[8];
for (int i=0;i<8;i++) {
tmp[i] = _mm256_extract_epi8(old[i], 15);
}
#endif
for (int i = 0; i < 8; i++) {
old[i] = simd_shuffle(old[i], move_left);
}
#ifdef WINIMP_IS_AVX16
for (int i=0;i<8;i++) {
old[i] = _mm256_insert_epi16(old[i], tmp[i], 8);
}
#endif
#ifdef WINIMP_IS_AVX8
for (int i=0;i<8;i++) {
old[i] = _mm256_insert_epi8(old[i], tmp[i], 16);
}
#endif
// 1st sub-block state is known
old[0] = simd_insert(old[0], 0, 0);
for (int i = 1; i < 8; i++) {
old[i] = simd_insert(old[i], -INF, 0);
}
} else {
// when estimating states, just set all to unknown
for (int i = 0; i < 8; i++) {
old[i] = simd_set1(-INF);
}
}
inputPtr = (simd_type_t*) &input[nof_blocks*(long_sb-loop_len)];
appPtr = (simd_type_t*) &app[nof_blocks*(long_sb-loop_len)];
parityPtr = (simd_type_t*) &parity[nof_blocks*(long_sb-loop_len)];
for (int k = 0; k < loop_len; k++) {
x = simd_load(inputPtr++);
y = simd_load(parityPtr++);
if (app) {
ap = simd_load(appPtr++);
x = simd_add(ap, x);
}
xy = simd_add(x,y);
m_b[0] = old[0];
m_b[1] = simd_add(old[3], y);
m_b[2] = simd_add(old[4], y);
m_b[3] = old[7];
m_b[4] = old[1];
m_b[5] = simd_add(old[2], y);
m_b[6] = simd_add(old[5], y);
m_b[7] = old[6];
new[0] = simd_add(old[1], xy);
new[1] = simd_add(old[2], x);
new[2] = simd_add(old[5], x);
new[3] = simd_add(old[6], xy);
new[4] = simd_add(old[0], xy);
new[5] = simd_add(old[3], x);
new[6] = simd_add(old[4], x);
new[7] = simd_add(old[7], xy);
// Load beta and compute output only when passing through all window
if (loop_len == long_sb) {
simd_type_t beta;
for (int i = 0; i < 8; i++) {
beta = simd_load(betaPtr++);
max0[i] = simd_add(beta, m_b[i]);
max1[i] = simd_add(beta, new[i]);
#if debug_enabled_win
beta_save[i] = beta;
#endif
}
m1 = simd_max(max1[0], max1[1]);
m0 = simd_max(max0[0], max0[1]);
for (int i = 2; i < 8; i++) {
m1 = simd_max(m1, max1[i]);
m0 = simd_max(m0, max0[i]);
}
simd_type_t out = simd_sub(m1, m0);
// Divide output when using 8-bit arithmetic
#ifdef divide_output
out = simd_rb_shift(out, divide_output);
#endif
simd_store(outputPtr++, out);
debug_state(0);
}
for (int i = 0; i < 8; i++) {
old[i] = simd_max(m_b[i], new[i]);
}
// normalize
MAKE_FUNC(normalize)(k, old);
if (loop_len != long_sb) {
debug_state_pre(0);
}
}
}
}
int MAKE_FUNC(init)(void **hh, uint32_t max_long_cb)
{
*hh = calloc(1, sizeof(MAKE_TYPE));
MAKE_TYPE *h = (MAKE_TYPE*) *hh;
h->beta = srslte_vec_malloc(sizeof(llr_t) * 8 * max_long_cb * nof_blocks);
if (!h->beta) {
perror("srslte_vec_malloc");
return -1;
}
h->max_long_cb = max_long_cb;
return nof_blocks;
}
void MAKE_FUNC(free)(void *hh)
{
MAKE_TYPE *h = (MAKE_TYPE*) hh;
if (h->beta) {
free(h->beta);
}
bzero(h, sizeof(MAKE_TYPE));
}
void MAKE_FUNC(dec)(void *hh, llr_t *input, llr_t *app, llr_t *parity, llr_t *output, uint32_t long_cb)
{
MAKE_TYPE *h = (MAKE_TYPE*) hh;
MAKE_FUNC(beta)(h, input, app, parity, long_cb);
MAKE_FUNC(alpha)(h, input, app, parity, output, long_cb);
#if debug_enabled_win
printf("running win decoder: %s\n", STRING(WINIMP));
#endif
}
#define INSERT8_INPUT(reg, st, off) reg = simd_insert(reg, input[3*(i+(st+0)*long_sb)+off], st+0);\
reg = simd_insert(reg, input[3*(i+(st+1)*long_sb)+off], st+1);\
reg = simd_insert(reg, input[3*(i+(st+2)*long_sb)+off], st+2);\
reg = simd_insert(reg, input[3*(i+(st+3)*long_sb)+off], st+3);\
reg = simd_insert(reg, input[3*(i+(st+4)*long_sb)+off], st+4);\
reg = simd_insert(reg, input[3*(i+(st+5)*long_sb)+off], st+5);\
reg = simd_insert(reg, input[3*(i+(st+6)*long_sb)+off], st+6);\
reg = simd_insert(reg, input[3*(i+(st+7)*long_sb)+off], st+7);
void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_t *parity_0, llr_t *parity_1, uint32_t long_cb)
{
simd_type_t *systPtr = (simd_type_t*) systematic;
simd_type_t *parity0Ptr = (simd_type_t*) parity_0;
simd_type_t *parity1Ptr = (simd_type_t*) parity_1;
simd_type_t syst, parity0, parity1;
for (int i=0;i<long_sb;i++) {
INSERT8_INPUT(syst, 0, 0);
INSERT8_INPUT(parity0, 0, 1);
INSERT8_INPUT(parity1, 0, 2);
#if nof_blocks >= 16
INSERT8_INPUT(syst, 8, 0);
INSERT8_INPUT(parity0, 8, 1);
INSERT8_INPUT(parity1, 8, 2);
#endif
#if nof_blocks >= 32
INSERT8_INPUT(syst, 16, 0);
INSERT8_INPUT(parity0, 16, 1);
INSERT8_INPUT(parity1, 16, 2);
INSERT8_INPUT(syst, 24, 0);
INSERT8_INPUT(parity0, 24, 1);
INSERT8_INPUT(parity1, 24, 2);
#endif
simd_store(systPtr++, syst);
simd_store(parity0Ptr++, parity0);
simd_store(parity1Ptr++, parity1);
}
for (int i = long_cb; i < long_cb + 3; i++) {
systematic[i] = input[3*long_cb + 2*(i - long_cb)];
parity_0[i] = input[3*long_cb + 2*(i - long_cb) + 1];
app2[i] = input[3*long_cb + 6 + 2*(i - long_cb)];
parity_1[i] = input[3*long_cb + 6 + 2*(i - long_cb) + 1];
}
}
#define deinter(x,win) ((x%(long_cb/win))*(win)+x/(long_cb/win))
#define reset_cnt(a,b) if(!((a+1)%b)) { \
k+=b*nof_blocks; \
if (k >= long_cb) { \
k -= (long_cb-1);\
}\
}
#define insert_bit(a,b) ap = _mm_insert_epi16(ap, app1[k+(a%b)*nof_blocks], 7-a); \
reset_cnt(a,b); \
#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \
insert_bit(0,b);\
insert_bit(1,b);\
insert_bit(2,b);\
insert_bit(3,b);\
insert_bit(4,b);\
insert_bit(5,b);\
insert_bit(6,b);\
insert_bit(7,b);\
output[i] = (uint8_t) _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_packs_epi16(ap,zeros),zeros));\
}
/* No improvement to use AVX here */
void MAKE_FUNC(decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb)
{
uint32_t k=0;
__m128i zeros = _mm_setzero_si128();
__m128i ap;
if ((long_cb%(nof_blocks*8)) == 0) {
decide_for(8);
} else if ((long_cb%(nof_blocks*4)) == 0) {
decide_for(4);
} else if ((long_cb%(nof_blocks*2)) == 0) {
decide_for(2);
} else {
decide_for(1);
}
}
#undef WINIMP
#undef nof_blocks
#undef llr_t
#undef normalize_period
#undef INF
#undef win_overlap_len
#undef simd_type_t
#undef simd_load
#undef simd_store
#undef simd_add
#undef simd_sub
#undef simd_max
#undef simd_set1
#undef simd_insert
#undef simd_shuffle
#undef move_right
#undef move_left
#undef debug_enabled_win
#ifdef normalize_max
#undef normalize_max
#endif
#ifdef use_saturated_add
#undef use_saturated_add
#endif
#ifdef simd_rb_shift
#undef simd_rb_shift
#endif
#ifdef divide_output
#undef divide_output
#endif