00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_mat_mult_fast_q15.c 00009 * 00010 * Description: Q15 matrix multiplication (fast variant) 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00074 arm_status arm_mat_mult_fast_q15( 00075 const arm_matrix_instance_q15 * pSrcA, 00076 const arm_matrix_instance_q15 * pSrcB, 00077 arm_matrix_instance_q15 * pDst, 00078 q15_t * pState) 00079 { 00080 q31_t sum; /* accumulator */ 00081 q31_t in; /* Temporary variable to hold the input value */ 00082 q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */ 00083 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */ 00084 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */ 00085 // q15_t *pDst = pDst->pData; /* output data matrix pointer */ 00086 q15_t *px; /* Temporary output data matrix pointer */ 00087 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ 00088 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ 00089 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ 00090 uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ 00091 uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ 00092 arm_status status; /* status of matrix multiplication */ 00093 00094 #ifdef ARM_MATH_MATRIX_CHECK 00095 00096 00097 /* Check for matrix mismatch condition */ 00098 00099 if((pSrcA->numCols != pSrcB->numRows) || 00100 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) 00101 { 00102 /* Set status as ARM_MATH_SIZE_MISMATCH */ 00103 status = ARM_MATH_SIZE_MISMATCH; 00104 } 00105 else 00106 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ 00107 00108 { 00109 /* Matrix transpose */ 00110 do 00111 { 00112 /* Apply loop unrolling and exchange the columns with row elements */ 00113 col = numColsB >> 2; 00114 00115 /* The pointer px is set to starting address of the column being processed */ 00116 px = pSrcBT + i; 00117 00118 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00119 ** a second loop below computes the remaining 1 to 3 samples. */ 00120 while(col > 0u) 00121 { 00122 /* Read two elements from the row */ 00123 in = *__SIMD32(pInB)++; 00124 00125 /* Unpack and store one element in the destination */ 00126 #ifndef ARM_MATH_BIG_ENDIAN 00127 00128 *px = (q15_t) in; 00129 00130 #else 00131 00132 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00133 00134 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00135 00136 /* Update the pointer px to point to the next row of the transposed matrix */ 00137 px += numRowsB; 00138 00139 /* Unpack and store the second element in the destination */ 00140 #ifndef ARM_MATH_BIG_ENDIAN 00141 00142 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00143 00144 #else 00145 00146 *px = (q15_t) in; 00147 00148 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00149 00150 00151 /* Update the pointer px to point to the next row of the transposed matrix */ 00152 px += numRowsB; 00153 00154 /* Read two elements from the row */ 00155 in = *__SIMD32(pInB)++; 00156 00157 /* Unpack and store one element in the destination */ 00158 #ifndef ARM_MATH_BIG_ENDIAN 00159 00160 *px = (q15_t) in; 00161 00162 #else 00163 00164 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00165 00166 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00167 00168 /* Update the pointer px to point to the next row of the transposed matrix */ 00169 px += numRowsB; 00170 00171 /* Unpack and store the second element in the destination */ 00172 00173 #ifndef ARM_MATH_BIG_ENDIAN 00174 00175 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); 00176 00177 #else 00178 00179 *px = (q15_t) in; 00180 00181 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00182 00183 /* Update the pointer px to point to the next row of the transposed matrix */ 00184 px += numRowsB; 00185 00186 /* Decrement the column loop counter */ 00187 col--; 00188 } 00189 00190 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. 00191 ** No loop unrolling is used. */ 00192 col = numColsB % 0x4u; 00193 00194 while(col > 0u) 00195 { 00196 /* Read and store the input element in the destination */ 00197 *px = *pInB++; 00198 00199 /* Update the pointer px to point to the next row of the transposed matrix */ 00200 px += numRowsB; 00201 00202 /* Decrement the column loop counter */ 00203 col--; 00204 } 00205 00206 i++; 00207 00208 /* Decrement the row loop counter */ 00209 row--; 00210 00211 } while(row > 0u); 00212 00213 /* Reset the variables for the usage in the following multiplication process */ 00214 row = numRowsA; 00215 i = 0u; 00216 px = pDst->pData; 00217 00218 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ 00219 /* row loop */ 00220 do 00221 { 00222 /* For every row wise process, the column loop counter is to be initiated */ 00223 col = numColsB; 00224 00225 /* For every row wise process, the pIn2 pointer is set 00226 ** to the starting address of the transposed pSrcB data */ 00227 pInB = pSrcBT; 00228 00229 /* column loop */ 00230 do 00231 { 00232 /* Set the variable sum, that acts as accumulator, to zero */ 00233 sum = 0; 00234 00235 /* Apply loop unrolling and compute 2 MACs simultaneously. */ 00236 colCnt = numColsA >> 1; 00237 00238 /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ 00239 pInA = pSrcA->pData + i; 00240 00241 /* matrix multiplication */ 00242 while(colCnt > 0u) 00243 { 00244 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00245 sum = __SMLAD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum); 00246 00247 /* Decrement the loop counter */ 00248 colCnt--; 00249 } 00250 00251 /* process odd column samples */ 00252 if((numColsA & 0x1u) > 0u) 00253 { 00254 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ 00255 sum += ((q31_t) * pInA * (*pInB++)); 00256 } 00257 00258 /* Saturate and store the result in the destination buffer */ 00259 *px = (q15_t) (sum >> 15); 00260 px++; 00261 00262 /* Decrement the column loop counter */ 00263 col--; 00264 00265 } while(col > 0u); 00266 00267 i = i + numColsA; 00268 00269 /* Decrement the row loop counter */ 00270 row--; 00271 00272 } while(row > 0u); 00273 00274 /* set status as ARM_MATH_SUCCESS */ 00275 status = ARM_MATH_SUCCESS; 00276 } 00277 00278 /* Return to application */ 00279 return (status); 00280 } 00281