Soft demodulator produces 16-bit fixed point

This commit is contained in:
ismagom 2015-10-12 19:03:20 +02:00
parent 19256c261e
commit f2b40c57ae
7 changed files with 239 additions and 71 deletions

View File

@ -84,7 +84,7 @@ SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h);
SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb);
SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h,
float * input,
llr_t * input,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_decision(srslte_tdec_t * h,
@ -96,7 +96,7 @@ SRSLTE_API void srslte_tdec_decision_byte(srslte_tdec_t * h,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h,
float * input,
llr_t * input,
uint8_t *output,
uint32_t nof_iterations,
uint32_t long_cb);

View File

@ -49,6 +49,11 @@ SRSLTE_API int srslte_demod_soft_demodulate(srslte_mod_t modulation,
float* llr,
int nsymbols);
SRSLTE_API int srslte_demod_soft_demodulate_s(srslte_mod_t modulation,
const cf_t* symbols,
short* llr,
int nsymbols);
/* High-level API */
typedef struct SRSLTE_API {
srslte_modem_table_t table;

View File

@ -47,22 +47,6 @@
#define INF 10000
#define ZERO 0
#define SCALE 100
/* Debug helper: prints the eight signed 16-bit elements of an SSE integer
 * register to stdout, in memory order, as "[e0 e1 ... e7]" followed by a newline. */
static void print128_num(__m128i var)
{
  /* Overlay the register with an array of 16-bit elements so each one can be
   * read individually (32-bit elements could be viewed the same way). */
  union {
    __m128i reg;
    int16_t lane[8];
  } view;

  view.reg = var;
  printf("[%d %d %d %d %d %d %d %d]\n",
         view.lane[0], view.lane[1], view.lane[2], view.lane[3],
         view.lane[4], view.lane[5], view.lane[6], view.lane[7]);
}
/* Debug helper: prints the four single-precision elements of an SSE float
 * register to stdout, in memory order, as "[f0 f1 f2 f3]" followed by a newline. */
void print128f_num(__m128 var)
{
  /* Overlay the register with a float array to read each element. */
  union {
    __m128 reg;
    float lane[4];
  } view;

  view.reg = var;
  printf("[%f %f %f %f]\n",
         view.lane[0], view.lane[1], view.lane[2], view.lane[3]);
}
/************************************************
@ -434,11 +418,10 @@ void srslte_tdec_free(srslte_tdec_t * h)
bzero(h, sizeof(srslte_tdec_t));
}
void deinterleave_input(srslte_tdec_t *h, float *input, uint32_t long_cb) {
void deinterleave_input(srslte_tdec_t *h, short *input, uint32_t long_cb) {
uint32_t i;
float *inputPtr = input;
__m128 inf0, inf1, inf2, inf3, inf4, inf5;
__m128i *inputPtr = (__m128i*) input;
__m128i in0, in1, in2;
__m128i s0, s1, s2, s;
__m128i p00, p01, p02, p0;
@ -468,30 +451,14 @@ void deinterleave_input(srslte_tdec_t *h, float *input, uint32_t long_cb) {
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 3rd word
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
__m128 vScalar = _mm_set1_ps(SCALE);
// Split systematic and parity bits
for (i = 0; i < long_cb/8; i++) {
inf0 = _mm_load_ps(inputPtr); inputPtr+=4;
inf1 = _mm_load_ps(inputPtr); inputPtr+=4;
inf2 = _mm_load_ps(inputPtr); inputPtr+=4;
inf3 = _mm_load_ps(inputPtr); inputPtr+=4;
inf4 = _mm_load_ps(inputPtr); inputPtr+=4;
inf5 = _mm_load_ps(inputPtr); inputPtr+=4;
inf0 = _mm_mul_ps(inf0, vScalar);
inf1 = _mm_mul_ps(inf1, vScalar);
inf2 = _mm_mul_ps(inf2, vScalar);
inf3 = _mm_mul_ps(inf3, vScalar);
inf4 = _mm_mul_ps(inf4, vScalar);
inf5 = _mm_mul_ps(inf5, vScalar);
in0 = _mm_load_si128(inputPtr); inputPtr++;
in1 = _mm_load_si128(inputPtr); inputPtr++;
in2 = _mm_load_si128(inputPtr); inputPtr++;
in0 = _mm_packs_epi32(_mm_cvtps_epi32(inf0), _mm_cvtps_epi32(inf1));
in1 = _mm_packs_epi32(_mm_cvtps_epi32(inf2), _mm_cvtps_epi32(inf3));
in2 = _mm_packs_epi32(_mm_cvtps_epi32(inf4), _mm_cvtps_epi32(inf5));
/* Deinterleave Systematic bits */
s0 = _mm_shuffle_epi8(in0, s0_mask);
s1 = _mm_shuffle_epi8(in1, s1_mask);
@ -525,17 +492,17 @@ void deinterleave_input(srslte_tdec_t *h, float *input, uint32_t long_cb) {
}
for (i = 0; i < 3; i++) {
h->syst[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 2*i];
h->parity0[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 2*i + 1];
h->syst[i+long_cb] = input[3*long_cb + 2*i];
h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
}
for (i = 0; i < 3; i++) {
h->app2[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 6 + 2*i];
h->parity1[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 6 + 2*i + 1];
h->app2[i+long_cb] = input[3*long_cb + 6 + 2*i];
h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
}
}
void srslte_tdec_iteration(srslte_tdec_t * h, float * input, uint32_t long_cb)
void srslte_tdec_iteration(srslte_tdec_t * h, short * input, uint32_t long_cb)
{
if (h->current_cbidx >= 0) {
@ -639,7 +606,7 @@ void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long
}
}
int srslte_tdec_run_all(srslte_tdec_t * h, float * input, uint8_t *output,
int srslte_tdec_run_all(srslte_tdec_t * h, short * input, uint8_t *output,
uint32_t nof_iterations, uint32_t long_cb)
{
if (srslte_tdec_reset(h, long_cb)) {

View File

@ -112,6 +112,7 @@ void parse_args(int argc, char **argv) {
int main(int argc, char **argv) {
uint32_t frame_cnt;
float *llr;
short *llr_s;
uint8_t *llr_c;
uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols;
uint32_t i, j;
@ -173,6 +174,11 @@ int main(int argc, char **argv) {
perror("malloc");
exit(-1);
}
llr_s = srslte_vec_malloc(coded_length * sizeof(short));
if (!llr_s) {
perror("malloc");
exit(-1);
}
llr_c = srslte_vec_malloc(coded_length * sizeof(uint8_t));
if (!llr_c) {
perror("malloc");
@ -239,7 +245,10 @@ int main(int argc, char **argv) {
}
srslte_ch_awgn_f(llr, llr, var[i], coded_length);
for (j=0;j<coded_length;j++) {
llr_s[j] = (int16_t) (100*llr[j]);
}
/* decoder */
srslte_tdec_reset(&tdec, frame_length);
srslte_tdec_vl_reset(&tdec_vl, frame_length);
@ -253,7 +262,7 @@ int main(int argc, char **argv) {
gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_run_all(&tdec, llr, data_rx_bytes, t, frame_length);
srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);
}
gettimeofday(&tdata[2], NULL);
get_time_interval(tdata);
@ -262,7 +271,7 @@ int main(int argc, char **argv) {
srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
errors += srslte_bit_diff(data_tx, data_rx, frame_length);
gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_vl_run_all(&tdec_vl, llr, data_rx, t, frame_length);

View File

@ -33,8 +33,23 @@
#include "srslte/utils/bit.h"
#include "srslte/modem/demod_soft.h"
#define HAVE_SIMD
#ifdef HAVE_SIMD
#include <xmmintrin.h>
#include <tmmintrin.h>
#endif
//#define SCALE_DEMOD16QAM
#define SCALE_SHORT_CONV 100
/* BPSK soft demodulation with 16-bit output.
 *
 * Writes one value per symbol:
 *   llr[n] = -SCALE_SHORT_CONV * (Re{symbols[n]} + Im{symbols[n]}) / sqrt(2)
 * converted to short (C conversion, i.e. fractional part discarded, no clipping).
 *
 * symbols:  nsymbols input symbols
 * llr:      output buffer with room for nsymbols shorts
 */
void demod_bpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
  for (int n = 0; n < nsymbols; n++) {
    float re_plus_im = crealf(symbols[n]) + cimagf(symbols[n]);
    /* Scale in single precision first, then divide by the double sqrt(2). */
    llr[n] = (short) ((-SCALE_SHORT_CONV * re_plus_im) / sqrt(2));
  }
}
void demod_bpsk_lte(const cf_t *symbols, float *llr, int nsymbols) {
for (int i=0;i<nsymbols;i++) {
@ -42,6 +57,11 @@ void demod_bpsk_lte(const cf_t *symbols, float *llr, int nsymbols) {
}
}
/* QPSK soft demodulation with 16-bit output.
 *
 * The symbol buffer is reinterpreted as 2*nsymbols floats and handed to
 * srslte_vec_convert_fi() together with the factor -SCALE_SHORT_CONV*sqrt(2),
 * so two values are written to llr per input symbol.
 *
 * symbols:  nsymbols input symbols
 * llr:      output buffer with room for 2*nsymbols shorts
 *
 * Note: this function produces no diagnostic output; it is a pure
 * buffer-to-buffer conversion.
 */
void demod_qpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
  srslte_vec_convert_fi((float*) symbols, llr, -SCALE_SHORT_CONV*sqrt(2), nsymbols*2);
}
void demod_qpsk_lte(const cf_t *symbols, float *llr, int nsymbols) {
srslte_vec_sc_prod_fff((float*) symbols, -sqrt(2), llr, nsymbols*2);
}
@ -79,6 +99,62 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) {
}
}
/* Scalar 16QAM soft demodulation of symbols[first .. nsymbols-1].
 * For each symbol i, writes four shorts at llr[4*i .. 4*i+3]:
 *   -yre, -yim, |yre| - 2*SCALE/sqrt(10), |yim| - 2*SCALE/sqrt(10)
 * where yre/yim are the real/imaginary parts scaled by SCALE_SHORT_CONV and
 * converted to short (fractional part discarded). */
static void demod_16qam_lte_s_scalar(const cf_t *symbols, short *llr, int first, int nsymbols) {
  for (int i=first;i<nsymbols;i++) {
    short yre = (short) (SCALE_SHORT_CONV*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV/sqrt(10);
  }
}

/* 16QAM soft demodulation with 16-bit output (four values per symbol).
 *
 * With HAVE_SIMD defined, groups of four symbols are processed with SSE/SSSE3
 * and only the remaining (nsymbols % 4) symbols go through the scalar helper;
 * otherwise every symbol is handled by the scalar helper.
 *
 * The SIMD path uses aligned loads/stores (_mm_load_ps / _mm_store_si128), so
 * symbols and llr must be 16-byte aligned when HAVE_SIMD is defined.
 *
 * NOTE(review): the SIMD path converts with _mm_cvtps_epi32 and subtracts an
 * offset already truncated to an integer, while the scalar path discards the
 * fractional part after subtracting the non-integer offset; the two paths can
 * therefore differ by one LSB. _mm_packs_epi32 also saturates, the scalar
 * conversion does not.
 */
void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
  int first_scalar = 0;

#ifdef HAVE_SIMD
  const float *symbolsPtr = (const float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2;
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs;
  __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV/sqrt(10));
  __m128i result11, result12, result22, result21;
  /* Negative scale: symbol_i already holds the negated (-yre, -yim) values. */
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV);

  /* Byte shuffles that interleave, per symbol, the (re, im) pair taken from
   * symbol_i with the matching pair taken from symbol_abs. 0xff zeroes the
   * destination byte so both halves can be combined with OR. */
  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  for (int i=0;i<nsymbols/4;i++) {
    /* Four complex symbols = eight floats = two loads. */
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i = _mm_packs_epi32(symbol_i1, symbol_i2);

    symbol_abs = _mm_abs_epi16(symbol_i);
    symbol_abs = _mm_sub_epi16(symbol_abs, offset);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);

    /* 16 output shorts per iteration: symbols 0-1, then symbols 2-3. */
    _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++;
  }

  /* Symbols not covered by a full group of four are left to the scalar code. */
  first_scalar = 4*(nsymbols/4);
#endif

  demod_16qam_lte_s_scalar(symbols, llr, first_scalar, nsymbols);
}
void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
{
for (int i=0;i<nsymbols;i++) {
@ -95,6 +171,83 @@ void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
}
/* Scalar 64QAM soft demodulation of symbols[first .. nsymbols-1].
 * For each symbol i, writes six shorts at llr[6*i .. 6*i+5]:
 *   -yre, -yim,
 *   |yre| - 4*SCALE/sqrt(42), |yim| - 4*SCALE/sqrt(42),
 *   |llr[6*i+2]| - 2*SCALE/sqrt(42), |llr[6*i+3]| - 2*SCALE/sqrt(42)
 * where yre/yim are the real/imaginary parts scaled by SCALE_SHORT_CONV and
 * converted to short (fractional part discarded). */
static void demod_64qam_lte_s_scalar(const cf_t *symbols, short *llr, int first, int nsymbols) {
  for (int i=first;i<nsymbols;i++) {
    /* Kept as short (not float) so the integer abs() below gets an integer. */
    short yre = (short) (SCALE_SHORT_CONV*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV*cimagf(symbols[i]));

    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV/sqrt(42);
    /* The last pair is derived from the already-stored middle pair. */
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV/sqrt(42);
  }
}

/* 64QAM soft demodulation with 16-bit output (six values per symbol).
 *
 * With HAVE_SIMD defined, groups of four symbols are processed with SSE/SSSE3
 * (24 output shorts = three 128-bit stores per group) and only the remaining
 * (nsymbols % 4) symbols go through the scalar helper; otherwise every symbol
 * is handled by the scalar helper.
 *
 * The SIMD path uses aligned loads/stores (_mm_load_ps / _mm_store_si128), so
 * symbols and llr must be 16-byte aligned when HAVE_SIMD is defined.
 *
 * NOTE(review): the SIMD path converts with _mm_cvtps_epi32 and subtracts
 * offsets already truncated to integers, while the scalar path discards the
 * fractional part after subtracting the non-integer offsets; the two paths can
 * therefore differ by one LSB. _mm_packs_epi32 also saturates, the scalar
 * conversion does not.
 */
void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
{
  int first_scalar = 0;

#ifdef HAVE_SIMD
  const float *symbolsPtr = (const float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2;
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs, symbol_abs2;
  __m128i offset1 = _mm_set1_epi16(4*SCALE_SHORT_CONV/sqrt(42));
  __m128i offset2 = _mm_set1_epi16(2*SCALE_SHORT_CONV/sqrt(42));
  /* Negative scale: symbol_i already holds the negated (-yre, -yim) values. */
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;

  /* Byte shuffles that scatter each symbol's (re, im) pair from the three
   * source vectors into its slot in the three output vectors. 0xff zeroes the
   * destination byte so the partial results can be combined with OR.
   * Output layout (one pair per 32-bit slot):
   *   vector 1: neg0 abs0 abs2_0 neg1
   *   vector 2: abs1 abs2_1 neg2 abs2
   *   vector 3: abs2_2 neg3 abs3 abs2_3 */
  __m128i shuffle_negated_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(11,10,9,8,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,9,8);

  for (int i=0;i<nsymbols/4;i++) {
    /* Four complex symbols = eight floats = two loads. */
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i = _mm_packs_epi32(symbol_i1, symbol_i2);

    symbol_abs = _mm_abs_epi16(symbol_i);
    symbol_abs = _mm_sub_epi16(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi16(_mm_abs_epi16(symbol_abs), offset2);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);

    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;
  }

  /* Symbols not covered by a full group of four are left to the scalar code. */
  first_scalar = 4*(nsymbols/4);
#endif

  demod_64qam_lte_s_scalar(symbols, llr, first_scalar, nsymbols);
}
int srslte_demod_soft_demodulate(srslte_mod_t modulation, const cf_t* symbols, float* llr, int nsymbols) {
switch(modulation) {
case SRSLTE_MOD_BPSK:
@ -116,6 +269,27 @@ int srslte_demod_soft_demodulate(srslte_mod_t modulation, const cf_t* symbols, f
return 0;
}
/* Soft-demodulates nsymbols symbols into 16-bit values by forwarding to the
 * demod_*_lte_s routine that matches the modulation.
 *
 * Returns 0 after a supported modulation (BPSK, QPSK, 16QAM, 64QAM) has been
 * processed. For any other value an error is printed to stderr, llr is left
 * untouched and -1 is returned.
 */
int srslte_demod_soft_demodulate_s(srslte_mod_t modulation, const cf_t* symbols, short* llr, int nsymbols) {
  if (modulation == SRSLTE_MOD_BPSK) {
    demod_bpsk_lte_s(symbols, llr, nsymbols);
  } else if (modulation == SRSLTE_MOD_QPSK) {
    demod_qpsk_lte_s(symbols, llr, nsymbols);
  } else if (modulation == SRSLTE_MOD_16QAM) {
    demod_16qam_lte_s(symbols, llr, nsymbols);
  } else if (modulation == SRSLTE_MOD_64QAM) {
    demod_64qam_lte_s(symbols, llr, nsymbols);
  } else {
    fprintf(stderr, "Invalid modulation %d\n", modulation);
    return -1;
  }
  return 0;
}
/* High-Level API */
int srslte_demod_soft_initialize(srslte_demod_soft_hl* hl) {

View File

@ -112,6 +112,7 @@ int main(int argc, char **argv) {
uint8_t *input, *output;
cf_t *symbols;
float *llr;
short *llr_s;
parse_args(argc, argv);
@ -125,34 +126,41 @@ int main(int argc, char **argv) {
num_bits = mod.nbits_x_symbol * (num_bits / mod.nbits_x_symbol);
/* allocate buffers */
input = malloc(sizeof(uint8_t) * num_bits);
input = srslte_vec_malloc(sizeof(uint8_t) * num_bits);
if (!input) {
perror("malloc");
exit(-1);
}
output = malloc(sizeof(uint8_t) * num_bits);
output = srslte_vec_malloc(sizeof(uint8_t) * num_bits);
if (!output) {
perror("malloc");
exit(-1);
}
symbols = malloc(sizeof(cf_t) * num_bits / mod.nbits_x_symbol);
symbols = srslte_vec_malloc(sizeof(cf_t) * num_bits / mod.nbits_x_symbol);
if (!symbols) {
perror("malloc");
exit(-1);
}
llr = malloc(sizeof(float) * num_bits);
llr = srslte_vec_malloc(sizeof(float) * num_bits);
if (!llr) {
perror("malloc");
exit(-1);
}
llr_s = srslte_vec_malloc(sizeof(short) * num_bits);
if (!llr_s) {
perror("malloc");
exit(-1);
}
/* generate random data */
srand(0);
int ret = -1;
struct timeval t[3];
float mean_texec = 0.0;
float mean_texec_s = 0.0;
for (int n=0;n<nof_frames;n++) {
for (i=0;i<num_bits;i++) {
input[i] = rand()%2;
@ -165,11 +173,20 @@ int main(int argc, char **argv) {
srslte_demod_soft_demodulate(modulation, symbols, llr, num_bits / mod.nbits_x_symbol);
gettimeofday(&t[2], NULL);
get_time_interval(t);
/* compute exponentially averaged execution time */
if (n > 0) {
mean_texec = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec, n-1);
}
gettimeofday(&t[1], NULL);
srslte_demod_soft_demodulate_s(modulation, symbols, llr_s, num_bits / mod.nbits_x_symbol);
gettimeofday(&t[2], NULL);
get_time_interval(t);
if (n > 0) {
mean_texec_s = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_s, n-1);
}
if (SRSLTE_VERBOSE_ISDEBUG()) {
printf("bits=");
@ -180,6 +197,10 @@ int main(int argc, char **argv) {
printf("llr=");
srslte_vec_fprint_f(stdout, llr, num_bits);
printf("llr_s=");
srslte_vec_fprint_s(stdout, llr_s, num_bits);
}
// Check demodulation errors
@ -200,6 +221,7 @@ clean_exit:
srslte_modem_table_free(&mod);
printf("Mean Throughput: %.2f. Mbps ExTime: %.2f us\n", num_bits/mean_texec, mean_texec);
printf("Mean Throughput: %.2f/%.2f. Mbps ExTime: %.2f/%.2f us\n",
num_bits/mean_texec, num_bits/mean_texec_s, mean_texec, mean_texec_s);
exit(ret);
}

View File

@ -168,35 +168,31 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
/* Modified from volk_32f_s32f_convert_16i_a_sse2. Removed clipping */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
{
unsigned int number = 0;
unsigned int number = 0;
const unsigned int eighthPoints = len / 8;
const float* inputVectorPtr = (const float*)x;
int16_t* outputVectorPtr = z;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scale);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
// Scale and clip
ret1 = _mm_mul_ps(inputVal1, vScalar);
ret2 = _mm_mul_ps(inputVal2, vScalar);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
printf("intinput: "); print128_num(intInputVal1);
printf("intinput2: "); print128_num(intInputVal2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
@ -204,13 +200,8 @@ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
}
number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
for(; number < len; number++){
z[number] = (int16_t) (x[number] * scale);
}
}