diff --git a/lib/include/srslte/phy/fec/viterbi.h b/lib/include/srslte/phy/fec/viterbi.h
index 043a6f9f9..d69750fb3 100644
--- a/lib/include/srslte/phy/fec/viterbi.h
+++ b/lib/include/srslte/phy/fec/viterbi.h
@@ -106,6 +106,12 @@ SRSLTE_API int srslte_viterbi_init_neon(srslte_viterbi_t *q,
                                         uint32_t max_frame_length,
                                         bool tail_bitting);
 
+SRSLTE_API int srslte_viterbi_init_avx2(srslte_viterbi_t *q,
+                                        srslte_viterbi_type_t type,
+                                        int poly[3],
+                                        uint32_t max_frame_length,
+                                        bool tail_bitting);
+
 #endif
diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h
index cd6eb4d28..3ecdf7b59 100644
--- a/lib/include/srslte/phy/utils/vector_simd.h
+++ b/lib/include/srslte/phy/utils/vector_simd.h
@@ -36,19 +36,31 @@ extern "C" {
 #include "srslte/config.h"
 
 SRSLTE_API int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len);
+
+SRSLTE_API int srslte_vec_dot_prod_sss_simd_avx(short *x, short *y, uint32_t len);
 
 SRSLTE_API void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_sum_sss_simd_avx(short *x, short *y, short *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_sub_sss_simd_avx(short *x, short *y, short *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_prod_sss_simd_avx(short *x, short *y, short *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_sc_div2_sss_simd(short *x, int n_rightshift, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_sc_div2_sss_simd_avx(short *x, int n_rightshift, short *z, uint32_t len);
 
 SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t len);
 
 SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len);
 
+SRSLTE_API void srslte_32fc_s32f_multiply_32fc_avx(cf_t *z, const cf_t *x, const float h, const uint32_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/src/phy/fec/viterbi.c b/lib/src/phy/fec/viterbi.c
index baebed0a1..444c29338 100644
--- a/lib/src/phy/fec/viterbi.c
+++ b/lib/src/phy/fec/viterbi.c
@@ -42,6 +42,15 @@
 
 #define DEFAULT_GAIN 100
 
+/* Compile-time switch: route the 1/3, K=7 decoder through AVX2 when available */
+#define AVX_ON
+
+#ifdef LV_HAVE_AVX
+#ifdef AVX_ON
+#define USE_AVX
+#endif
+#endif
+
 //#undef LV_HAVE_SSE
 
 int decode37(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) {
@@ -120,6 +128,51 @@ void free37_sse(void *o) {
 
 #endif
 
+#ifdef LV_HAVE_AVX
+int decode37_avx2(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) {
+  srslte_viterbi_t *q = o;
+  uint32_t best_state;
+
+  if (frame_length > q->framebits) {
+    fprintf(stderr, "Initialized decoder for max frame length %d bits\n", q->framebits);
+    return -1;
+  }
+
+  /* Initialize Viterbi decoder */
+  init_viterbi37_avx2(q->ptr, q->tail_biting ? -1 : 0);
+
+  /* Decode block */
+  if (q->tail_biting) {
+    /* Repeat the frame TB_ITER times so the trellis wraps around */
+    for (int i = 0; i < TB_ITER; i++) {
+      memcpy(&q->tmp[i * 3 * frame_length], symbols, 3 * frame_length * sizeof(uint8_t));
+    }
+    update_viterbi37_blk_avx2(q->ptr, q->tmp, TB_ITER * frame_length, &best_state);
+    chainback_viterbi37_avx2(q->ptr, q->tmp, TB_ITER * frame_length, best_state);
+    memcpy(data, &q->tmp[((int) (TB_ITER / 2)) * frame_length], frame_length * sizeof(uint8_t));
+  } else {
+    update_viterbi37_blk_avx2(q->ptr, symbols, frame_length + q->K - 1, NULL);
+    chainback_viterbi37_avx2(q->ptr, data, frame_length, 0);
+  }
+
+  return q->framebits;
+}
+
+void free37_avx2(void *o) {
+  srslte_viterbi_t *q = o;
+  if (q->symbols_uc) {
+    free(q->symbols_uc);
+  }
+  if (q->tmp) {
+    free(q->tmp);
+  }
+  delete_viterbi37_avx2(q->ptr);
+}
+
+#endif
+
 #ifdef HAVE_NEON
 int decode37_neon(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) {
   srslte_viterbi_t *q = o;
@@ -286,6 +339,45 @@ int init37_neon(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) {
 }
 #endif
 
+#ifdef LV_HAVE_AVX
+int init37_avx2(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) {
+  q->K = 7;
+  q->R = 3;
+  q->framebits = framebits;
+  q->gain_quant_s = 4;
+  q->gain_quant = DEFAULT_GAIN;
+  q->tail_biting = tail_biting;
+  q->decode = decode37_avx2;
+  q->free = free37_avx2;
+  q->decode_f = NULL;
+  q->symbols_uc = srslte_vec_malloc(3 * (q->framebits + q->K - 1) * sizeof(uint8_t));
+  if (!q->symbols_uc) {
+    perror("malloc");
+    return -1;
+  }
+  if (q->tail_biting) {
+    q->tmp = srslte_vec_malloc(TB_ITER * 3 * (q->framebits + q->K - 1) * sizeof(uint8_t));
+    if (!q->tmp) {
+      perror("malloc");
+      free37_avx2(q);
+      return -1;
+    }
+  } else {
+    q->tmp = NULL;
+  }
+
+  if ((q->ptr = create_viterbi37_avx2(poly, TB_ITER * framebits)) == NULL) {
+    fprintf(stderr, "create_viterbi37_avx2 failed\n");
+    free37_avx2(q);
+    return -1;
+  } else {
+    return 0;
+  }
+}
+#endif
+
 void srslte_viterbi_set_gain_quant(srslte_viterbi_t *q, float gain_quant) {
   q->gain_quant = gain_quant;
 }
@@ -299,7 +391,11 @@ int srslte_viterbi_init(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting) {
   switch (type) {
     case SRSLTE_VITERBI_37:
 #ifdef LV_HAVE_SSE
-      return init37_sse(q, poly, max_frame_length, tail_bitting);
+#ifdef USE_AVX
+      return init37_avx2(q, poly, max_frame_length, tail_bitting);
+#else
+      return init37_sse(q, poly, max_frame_length, tail_bitting);
+#endif
 #else
 #ifdef HAVE_NEON
       return init37_neon(q, poly, max_frame_length, tail_bitting);
@@ -320,6 +416,13 @@ int srslte_viterbi_init_sse(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting)
 }
 #endif
 
+#ifdef LV_HAVE_AVX
+int srslte_viterbi_init_avx2(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting)
+{
+  return init37_avx2(q, poly, max_frame_length, tail_bitting);
+}
+#endif
+
 void srslte_viterbi_free(srslte_viterbi_t *q) {
   if (q->free) {
     q->free(q);
diff --git a/lib/src/phy/fec/viterbi37.h b/lib/src/phy/fec/viterbi37.h
index 2c7f8c57f..574f4fd87 100644
--- a/lib/src/phy/fec/viterbi37.h
+++ b/lib/src/phy/fec/viterbi37.h
@@ -88,3 +88,25 @@ int update_viterbi37_blk_neon(void *p,
                               uint8_t *syms,
                               uint32_t nbits,
                               uint32_t *best_state);
+
+void *create_viterbi37_avx2(int polys[3],
+                            uint32_t len);
+
+int init_viterbi37_avx2(void *p,
+                        int starting_state);
+
+void reset_blk_avx2(void *p, int nbits);
+
+int chainback_viterbi37_avx2(void *p,
+                             uint8_t *data,
+                             uint32_t nbits,
+                             uint32_t endstate);
+
+void delete_viterbi37_avx2(void *p);
+
+void update_viterbi37_blk_avx2(void *p,
+                               uint8_t *syms,
+                               uint32_t nbits,
+                               uint32_t *best_state);
diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c
index a5921aeda..12809950b 100644
--- a/lib/src/phy/utils/vector.c
+++ b/lib/src/phy/utils/vector.c
@@ -109,7 +109,11 @@ void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
     z[i] = x[i]-y[i];
   }
 #else
+#ifdef LV_HAVE_AVX
+  srslte_vec_sub_sss_simd_avx(x, y, z, len);
+#else
   srslte_vec_sub_sss_simd(x, y, z, len);
+#endif
 #endif
 }
@@ -135,7 +139,11 @@ void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) {
     z[i] = x[i]+y[i];
   }
 #else
+#ifdef LV_HAVE_AVX
+  srslte_vec_sum_sss_simd_avx(x, y, z, len);
+#else
   srslte_vec_sum_sss_simd(x, y, z, len);
+#endif
 #endif
 }
@@ -204,7 +212,11 @@ void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len)
     z[i] = x[i]/pow2_div;
   }
 #else
+#ifdef LV_HAVE_AVX
+  srslte_vec_sc_div2_sss_simd_avx(x, n_rightshift, z, len);
+#else
   srslte_vec_sc_div2_sss_simd(x, n_rightshift, z, len);
+#endif
 #endif
 }
@@ -226,10 +238,15 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
     z[i] = x[i]*h;
   }
 #else
+#ifdef LV_HAVE_AVX
+  srslte_32fc_s32f_multiply_32fc_avx(z, x, h, len);
+#else
   cf_t hh;
   __real__ hh = h;
   __imag__ hh = 0;
   volk_32fc_s32fc_multiply_32fc(z,x,hh,len);
+#endif
 #endif
 }
@@ -514,7 +531,11 @@ void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
     z[i] = x[i]*y[i];
   }
 #else
+#ifdef LV_HAVE_AVX
+  srslte_vec_prod_sss_simd_avx(x,y,z,len);
+#else
   srslte_vec_prod_sss_simd(x,y,z,len);
+#endif
 #endif
 }
@@ -653,7 +674,11 @@ int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) {
   }
   return res;
 #else
+#ifdef LV_HAVE_AVX
+  return srslte_vec_dot_prod_sss_simd_avx(x, y, len);
+#else
   return srslte_vec_dot_prod_sss_simd(x, y, len);
+#endif
 #endif
 }
diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c
index 817dfa2da..81694b110 100644
--- a/lib/src/phy/utils/vector_simd.c
+++ b/lib/src/phy/utils/vector_simd.c
@@ -40,6 +40,9 @@
 #include <smmintrin.h>
 #endif
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#endif
 
 int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
@@ -83,6 +86,47 @@ int srslte_vec_dot_prod_sss_simd(short *x, short *y, uint32_t len)
   return result;
 }
 
+int srslte_vec_dot_prod_sss_simd_avx(short *x, short *y, uint32_t len)
+{
+  int result = 0;
+#ifdef LV_HAVE_AVX
+  unsigned int number = 0;
+  const unsigned int points = len / 16;
+
+  const __m256i* xPtr = (const __m256i*) x;
+  const __m256i* yPtr = (const __m256i*) y;
+
+  __m256i dotProdVal = _mm256_setzero_si256();
+
+  /* 32-byte alignment is not guaranteed for the inputs: use unaligned loads */
+  __m256i xVal, yVal, zVal;
+  for (; number < points; number++) {
+    xVal = _mm256_loadu_si256(xPtr);
+    yVal = _mm256_loadu_si256(yPtr);
+    zVal = _mm256_mullo_epi16(xVal, yVal);
+    dotProdVal = _mm256_add_epi16(dotProdVal, zVal);
+    xPtr++;
+    yPtr++;
+  }
+
+  short dotProdVector[16];
+  _mm256_storeu_si256((__m256i*) dotProdVector, dotProdVal);
+  for (int i = 0; i < 16; i++) {
+    result += dotProdVector[i];
+  }
+
+  number = points * 16;
+  for (; number < len; number++) {
+    result += (x[number] * y[number]);
+  }
+#endif
+  return result;
+}
+
 void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
@@ -116,6 +160,39 @@ }
 
+void srslte_vec_sum_sss_simd_avx(short *x, short *y, short *z, uint32_t len)
+{
+#ifdef LV_HAVE_AVX
+  unsigned int number = 0;
+  const unsigned int points = len / 16;
+
+  const __m256i* xPtr = (const __m256i*) x;
+  const __m256i* yPtr = (const __m256i*) y;
+  __m256i* zPtr = (__m256i*) z;
+
+  __m256i xVal, yVal, zVal;
+  for (; number < points; number++) {
+    xVal = _mm256_loadu_si256(xPtr);
+    yVal = _mm256_loadu_si256(yPtr);
+
+    zVal = _mm256_add_epi16(xVal, yVal);
+    _mm256_storeu_si256(zPtr, zVal);
+
+    xPtr++;
+    yPtr++;
+    zPtr++;
+  }
+
+  number = points * 16;
+  for (; number < len; number++) {
+    z[number] = x[number] + y[number];
+  }
+#endif
+}
+
 void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
@@ -148,6 +225,41 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
 #endif
 }
 
+void srslte_vec_sub_sss_simd_avx(short *x, short *y, short *z, uint32_t len)
+{
+#ifdef LV_HAVE_AVX
+  unsigned int number = 0;
+  const unsigned int points = len / 16;
+
+  const __m256i* xPtr = (const __m256i*) x;
+  const __m256i* yPtr = (const __m256i*) y;
+  __m256i* zPtr = (__m256i*) z;
+
+  __m256i xVal, yVal, zVal;
+  for (; number < points; number++) {
+    xVal = _mm256_loadu_si256(xPtr);
+    yVal = _mm256_loadu_si256(yPtr);
+
+    zVal = _mm256_sub_epi16(xVal, yVal);
+    _mm256_storeu_si256(zPtr, zVal);
+
+    xPtr++;
+    yPtr++;
+    zPtr++;
+  }
+
+  number = points * 16;
+  for (; number < len; number++) {
+    z[number] = x[number] - y[number];
+  }
+#endif
+}
+
 void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
@@ -180,6 +292,38 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
 #endif
 }
 
+void srslte_vec_prod_sss_simd_avx(short *x, short *y, short *z, uint32_t len)
+{
+#ifdef LV_HAVE_AVX
+  unsigned int number = 0;
+  const unsigned int points = len / 16;
+
+  const __m256i* xPtr = (const __m256i*) x;
+  const __m256i* yPtr = (const __m256i*) y;
+  __m256i* zPtr = (__m256i*) z;
+
+  __m256i xVal, yVal, zVal;
+  for (; number < points; number++) {
+    xVal = _mm256_loadu_si256(xPtr);
+    yVal = _mm256_loadu_si256(yPtr);
+
+    zVal = _mm256_mullo_epi16(xVal, yVal);
+    _mm256_storeu_si256(zPtr, zVal);
+
+    xPtr++;
+    yPtr++;
+    zPtr++;
+  }
+
+  number = points * 16;
+  for (; number < len; number++) {
+    z[number] = x[number] * y[number];
+  }
+#endif
+}
+
 void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
 {
 #ifdef LV_HAVE_SSE
@@ -210,6 +354,36 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
 #endif
 }
 
+void srslte_vec_sc_div2_sss_simd_avx(short *x, int k, short *z, uint32_t len)
+{
+#ifdef LV_HAVE_AVX
+  unsigned int number = 0;
+  const unsigned int points = len / 16;
+
+  const __m256i* xPtr = (const __m256i*) x;
+  __m256i* zPtr = (__m256i*) z;
+
+  __m256i xVal, zVal;
+  for (; number < points; number++) {
+    xVal = _mm256_loadu_si256(xPtr);
+
+    zVal = _mm256_srai_epi16(xVal, k);
+    _mm256_storeu_si256(zPtr, zVal);
+
+    xPtr++;
+    zPtr++;
+  }
+
+  number = points * 16;
+  short divn = (1 << k);
+  for (; number < len; number++) {
+    z[number] = x[number] / divn;
+  }
+#endif
+}
diff --git a/lib/src/phy/fec/viterbi37_avx2.c b/lib/src/phy/fec/viterbi37_avx2.c
new file mode 100644
--- /dev/null
+++ b/lib/src/phy/fec/viterbi37_avx2.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <stdint.h>
+#include "parity.h"
+
+//#define DEBUG
+
+#ifdef LV_HAVE_AVX
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+/* For compilers that do not provide _mm256_set_m128i */
+#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
+#define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0))
+
+typedef union {
+  unsigned char c[64];
+  __m128i v[4];
+} metric_t;
+
+typedef union {
+  unsigned int w[2];
+  unsigned char c[8];
+  unsigned short s[4];
+  __m64 v;
+} decision_t;
+
+union branchtab27 {
+  unsigned char c[32];
+  __m256i v;
+} Branchtab37_avx2[3];
+
+/* State info for instance of Viterbi decoder */
+struct v37 {
+  metric_t metrics1;                   /* path metric buffer 1 */
+  metric_t metrics2;                   /* path metric buffer 2 */
+  decision_t *dp;                      /* Pointer to current decision */
+  metric_t *old_metrics, *new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;               /* Beginning of decisions for block */
+  uint32_t len;
+};
+
+void set_viterbi37_polynomial_avx2(int polys[3]) {
+  int state;
+
+  for (state = 0; state < 32; state++) {
+    Branchtab37_avx2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255 : 0;
+    Branchtab37_avx2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255 : 0;
+    Branchtab37_avx2[2].c[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255 : 0;
+  }
+}
+
+void clear_v37_avx2(struct v37 *vp) {
+  bzero(vp->decisions, sizeof(decision_t)*vp->len);
+  vp->dp = NULL;
+  bzero(&vp->metrics1, sizeof(metric_t));
+  bzero(&vp->metrics2, sizeof(metric_t));
+  vp->old_metrics = NULL;
+  vp->new_metrics = NULL;
+}
+
+/* Initialize Viterbi decoder for start of new frame */
+int init_viterbi37_avx2(void *p, int starting_state) {
+  struct v37 *vp = p;
+  uint32_t i;
+
+  for (i = 0; i < 64; i++) {
+    vp->metrics1.c[i] = 63;
+  }
+
+  clear_v37_avx2(vp);
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  if (starting_state != -1) {
+    vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+  }
+  return 0;
+}
+
+/* Create a new instance of a Viterbi decoder */
+void *create_viterbi37_avx2(int polys[3], uint32_t len) {
+  void *p;
+  struct v37 *vp;
+
+  set_viterbi37_polynomial_avx2(polys);
+
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if (posix_memalign(&p, sizeof(__m128i), sizeof(struct v37)))
+    return NULL;
+
+  vp = (struct v37 *)p;
+  if (posix_memalign(&p, sizeof(__m128i), (len+6)*sizeof(decision_t))) {
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  vp->len = len+6;
+  return vp;
+}
+
+/* Viterbi chainback */
+int chainback_viterbi37_avx2(void *p,
+                             uint8_t *data,       /* Decoded output data */
+                             uint32_t nbits,      /* Number of data bits */
+                             uint32_t endstate) { /* Terminal encoder state */
+  struct v37 *vp = p;
+
+  if (p == NULL)
+    return -1;
+
+  decision_t *d = (decision_t *)vp->decisions;
+
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate the decoded bits as we shift through the states
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* One decoded bit is written per output byte */
+  d += 6; /* Look past tail */
+  while (nbits--) {
+    int k;
+
+    k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1;
+    endstate = (endstate >> 1) | (k << 7);
+    data[nbits] = k;
+  }
+  return 0;
+}
+
+/* Delete instance of a Viterbi decoder */
+void delete_viterbi37_avx2(void *p) {
+  struct v37 *vp = p;
+
+  if (vp != NULL) {
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+/* Debug helpers: dump a vector register byte by byte */
+void printer_256i(char *s, __m256i val) {
+  printf("%s: ", s);
+  uint8_t *x = (uint8_t*) &val;
+  for (int i = 0; i < 32; i++) {
+    printf("%3d, ", x[i]);
+  }
+  printf("\n");
+}
+
+void printer_128i(char *s, __m128i val) {
+  printf("%s: ", s);
+  uint8_t *x = (uint8_t*) &val;
+  for (int i = 0; i < 16; i++) {
+    printf("%3d, ", x[i]);
+  }
+  printf("\n");
+}
+
+void printer_m64(char *s, __m64 val) {
+  printf("%s: ", s);
+  uint8_t *x = (uint8_t*) &val;
+  for (int i = 0; i < 8; i++) {
+    printf("%3d, ", x[i]);
+  }
+  printf("\n");
+}
+
+void update_viterbi37_blk_avx2(void *p, uint8_t *syms, uint32_t nbits, uint32_t *best_state) {
+  struct v37 *vp = p;
+  decision_t *d;
+
+  if (p == NULL)
+    return;
+
+#ifdef DEBUG
+  printf("[");
+#endif
+
+  d = (decision_t *) vp->dp;
+
+  for (uint32_t s = 0; s < nbits; s++) {
+    __m256i sym0v, sym1v, sym2v;
+    __m256i decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, survivor1, temp;
+    void *tmp;
+
+    /* Splat each of the 3 input symbols across a whole register */
+    sym0v = _mm256_set1_epi8(syms[3*s]);
+    sym1v = _mm256_set1_epi8(syms[3*s+1]);
+    sym2v = _mm256_set1_epi8(syms[3*s+2]);
+
+    /* Form branch metrics: XOR received symbols with the expected ones,
+     * average, and scale to 5 bits so path metrics fit in unsigned chars */
+    metric = _mm256_avg_epu8(_mm256_xor_si256(Branchtab37_avx2[0].v, sym0v),
+                             _mm256_xor_si256(Branchtab37_avx2[1].v, sym1v));
+    metric = _mm256_avg_epu8(metric, _mm256_xor_si256(Branchtab37_avx2[2].v, sym2v));
+    metric = _mm256_srli_epi16(metric, 3);
+    metric = _mm256_and_si256(metric, _mm256_set1_epi8(31));
+    m_metric = _mm256_sub_epi8(_mm256_set1_epi8(31), metric);
+
+    /* Add branch metrics to path metrics */
+    temp = _mm256_set_m128i(vp->old_metrics->v[1], vp->old_metrics->v[0]);
+    m0 = _mm256_add_epi8(temp, metric);
+    m2 = _mm256_add_epi8(temp, m_metric);
+
+    temp = _mm256_set_m128i(vp->old_metrics->v[3], vp->old_metrics->v[2]);
+    m3 = _mm256_add_epi8(temp, metric);
+    m1 = _mm256_add_epi8(temp, m_metric);
+
+    /* Compare and select, using modulo arithmetic */
+    decision0 = _mm256_cmpgt_epi8(_mm256_sub_epi8(m0, m1), _mm256_setzero_si256());
+    decision1 = _mm256_cmpgt_epi8(_mm256_sub_epi8(m2, m3), _mm256_setzero_si256());
+    survivor0 = _mm256_or_si256(_mm256_and_si256(decision0, m1), _mm256_andnot_si256(decision0, m0));
+    survivor1 = _mm256_or_si256(_mm256_and_si256(decision1, m3), _mm256_andnot_si256(decision1, m2));
+
+    /* Pack the 64 decision bits, reordering the in-lane 256-bit unpacks */
+    unsigned int x = _mm256_movemask_epi8(_mm256_unpackhi_epi8(decision0, decision1));
+    unsigned int y = _mm256_movemask_epi8(_mm256_unpacklo_epi8(decision0, decision1));
+
+    d->s[0] = (short) y;
+    d->s[1] = (short) x;
+    d->s[2] = (short) (y >> 16);
+    d->s[3] = (short) (x >> 16);
+
+    /* Interleave the survivors into the new path metrics, swapping the
+     * middle 128-bit halves to restore linear state order */
+    __m256i unpack;
+    unpack = _mm256_unpacklo_epi8(survivor0, survivor1);
+    vp->new_metrics->v[0] = _mm256_castsi256_si128(unpack);
+    vp->new_metrics->v[1] = _mm256_extractf128_si256(unpack, 1);
+
+    unpack = _mm256_unpackhi_epi8(survivor0, survivor1);
+    vp->new_metrics->v[2] = _mm256_castsi256_si128(unpack);
+    vp->new_metrics->v[3] = _mm256_extractf128_si256(unpack, 1);
+
+    __m128i temp1 = vp->new_metrics->v[1];
+    vp->new_metrics->v[1] = vp->new_metrics->v[2];
+    vp->new_metrics->v[2] = temp1;
+
+    /* See if we need to normalize */
+    if (vp->new_metrics->c[0] > 100) {
+      int i;
+      uint8_t adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+
+      adjustv = vp->new_metrics->v[0];
+      for (i = 1; i < 4; i++) {
+        adjustv = _mm_min_epu8(adjustv, vp->new_metrics->v[i]);
+      }
+
+      adjustv = _mm_min_epu8(adjustv, _mm_srli_si128(adjustv, 8));
+      adjustv = _mm_min_epu8(adjustv, _mm_srli_si128(adjustv, 4));
+      adjustv = _mm_min_epu8(adjustv, _mm_srli_si128(adjustv, 2));
+      adjustv = _mm_min_epu8(adjustv, _mm_srli_si128(adjustv, 1));
+
+      t.v = adjustv;
+      adjust = t.w[0] & 0xFF;
+      adjustv = _mm_set1_epi8(adjust);
+
+      /* We cannot use a saturated subtract, because we often have to adjust
+       * by more than SHRT_MAX. This is okay since it can't overflow anyway */
+      for (i = 0; i < 4; i++) {
+        vp->new_metrics->v[i] = _mm_sub_epi8(vp->new_metrics->v[i], adjustv);
+      }
+    }
+
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+
+  if (best_state) {
+    uint32_t i, bst = 0;
+    uint8_t minmetric = UINT8_MAX;
+    for (i = 0; i < 64; i++) {
+      if (vp->old_metrics->c[i] <= minmetric) {
+        bst = i;
+        minmetric = vp->old_metrics->c[i];
+      }
+    }
+    *best_state = bst;
+  }
+
+#ifdef DEBUG
+  printf("];\n===========================================\n");
+#endif
+
+  vp->dp = d;
+}
+
+#endif
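---
Note: srslte_vec_sc_prod_cfc in vector.c now calls srslte_32fc_s32f_multiply_32fc_avx, and vector_simd.h declares it, but the patch never defines it, so an AVX build will fail to link. Below is a minimal sketch of the missing body, written in the style of the other AVX routines in vector_simd.c and assuming it is added to that file (which already includes immintrin.h under LV_HAVE_AVX); the loop structure and local names are illustrative assumptions, not code from the original patch.

/* Sketch: scale a complex float vector by a real scalar, z = h * x.
 * Each 256-bit register holds 4 complex floats (8 floats); multiplying
 * both the real and imaginary parts by h is the complex-by-real product. */
void srslte_32fc_s32f_multiply_32fc_avx(cf_t *z, const cf_t *x, const float h, const uint32_t len)
{
#ifdef LV_HAVE_AVX
  uint32_t number = 0;
  const uint32_t points = len / 4;
  const __m256 hVal = _mm256_set1_ps(h);

  const float *xPtr = (const float*) x;
  float *zPtr = (float*) z;

  for (; number < points; number++) {
    /* 32-byte alignment is not guaranteed: use unaligned load/store */
    __m256 xVal = _mm256_loadu_ps(xPtr);
    _mm256_storeu_ps(zPtr, _mm256_mul_ps(xVal, hVal));
    xPtr += 8;
    zPtr += 8;
  }

  for (number = points * 4; number < len; number++) {
    z[number] = x[number] * h;
  }
#endif
}

With the LV_HAVE_AVX guards now added around the call sites in vector.c, builds without AVX keep using the volk fallback, so the sketch only needs to cover the AVX path.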