F030C8xx_KBus.git

Merge remote-tracking branch 'origin/FP0' into kNet

QuakeGod

2023-02-01 6126f6a78b14297cefb02f06ba58806767d424b5

提交 \| 用户 \| age
bfc108	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_cmplx_mat_mult_q15.c
	9	*
	10	* Description: Q15 complex matrix multiplication.
	11	*
	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	13	*
	14	* Redistribution and use in source and binary forms, with or without
	15	* modification, are permitted provided that the following conditions
	16	* are met:
	17	* - Redistributions of source code must retain the above copyright
	18	* notice, this list of conditions and the following disclaimer.
	19	* - Redistributions in binary form must reproduce the above copyright
	20	* notice, this list of conditions and the following disclaimer in
	21	* the documentation and/or other materials provided with the
	22	* distribution.
	23	* - Neither the name of ARM LIMITED nor the names of its contributors
	24	* may be used to endorse or promote products derived from this
	25	* software without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	38	* POSSIBILITY OF SUCH DAMAGE.
	39	* -------------------------------------------------------------------- */
	40	#include "arm_math.h"
	41
	42	/**
	43	* @ingroup groupMatrix
	44	*/
	45
	46	/**
	47	* @addtogroup CmplxMatrixMult
	48	* @{
	49	*/
	50
	51
	52	/**
	53	* @brief Q15 Complex matrix multiplication
	54	* @param[in] *pSrcA points to the first input complex matrix structure
	55	* @param[in] *pSrcB points to the second input complex matrix structure
	56	* @param[out] *pDst points to output complex matrix structure
	57	* @param[in] *pScratch points to the array for storing intermediate results
	58	* @return The function returns either
	59	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	60	*
	61	* \par Conditions for optimum performance
	62	* Input, output and scratch buffers should be aligned by 32-bit
	63	*
	64	* \par Restrictions
	65	* If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
	66	* In this case input, output, scratch buffers should be aligned by 32-bit
	67	*
	68	* @details
	69	* <b>Scaling and Overflow Behavior:</b>
	70	*
	71	* \par
	72	* The function is implemented using a 64-bit internal accumulator. The inputs to the
	73	* multiplications are in 1.15 format and multiplications yield a 2.30 result.
	74	* The 2.30 intermediate
	75	* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
	76	* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
	77	* truncated to 34.15 format by discarding the low 15 bits and then saturated to
	78	* 1.15 format.
	79	*
	80	* \par
	81	* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
	82	*
	83	*/
	84
	85
	86
	87
	88	arm_status arm_mat_cmplx_mult_q15(
	89	const arm_matrix_instance_q15 * pSrcA,
	90	const arm_matrix_instance_q15 * pSrcB,
	91	arm_matrix_instance_q15 * pDst,
	92	q15_t * pScratch)
	93	{
	94	/* accumulator */
	95	q15_t pSrcBT = pScratch; / input data matrix pointer for transpose */
	96	q15_t pInA = pSrcA->pData; / input data matrix pointer A of Q15 type */
	97	q15_t pInB = pSrcB->pData; / input data matrix pointer B of Q15 type */
	98	q15_t px; / Temporary output data matrix pointer */
	99	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
	100	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
	101	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
	102	uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
	103	uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
	104	arm_status status; /* status of matrix multiplication */
	105	q63_t sumReal, sumImag;
	106
	107	#ifdef UNALIGNED_SUPPORT_DISABLE
	108	q15_t in; /* Temporary variable to hold the input value */
	109	q15_t a, b, c, d;
	110	#else
	111	q31_t in; /* Temporary variable to hold the input value */
	112	q31_t prod1, prod2;
	113	q31_t pSourceA, pSourceB;
	114	#endif
	115
	116	#ifdef ARM_MATH_MATRIX_CHECK
	117	/* Check for matrix mismatch condition */
	118	if((pSrcA->numCols != pSrcB->numRows) \|\|
	119	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
	120	{
	121	/* Set status as ARM_MATH_SIZE_MISMATCH */
	122	status = ARM_MATH_SIZE_MISMATCH;
	123	}
	124	else
	125	#endif
	126	{
	127	/* Matrix transpose */
	128	do
	129	{
	130	/* Apply loop unrolling and exchange the columns with row elements */
	131	col = numColsB >> 2;
	132
	133	/* The pointer px is set to starting address of the column being processed */
	134	px = pSrcBT + i;
	135
	136	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
	137	** a second loop below computes the remaining 1 to 3 samples. */
	138	while(col > 0u)
	139	{
	140	#ifdef UNALIGNED_SUPPORT_DISABLE
	141	/* Read two elements from the row */
	142	in = *pInB++;
	143	*px = in;
	144	in = *pInB++;
	145	px[1] = in;
	146
	147	/* Update the pointer px to point to the next row of the transposed matrix */
	148	px += numRowsB * 2;
	149
	150	/* Read two elements from the row */
	151	in = *pInB++;
	152	*px = in;
	153	in = *pInB++;
	154	px[1] = in;
	155
	156	/* Update the pointer px to point to the next row of the transposed matrix */
	157	px += numRowsB * 2;
	158
	159	/* Read two elements from the row */
	160	in = *pInB++;
	161	*px = in;
	162	in = *pInB++;
	163	px[1] = in;
	164
	165	/* Update the pointer px to point to the next row of the transposed matrix */
	166	px += numRowsB * 2;
	167
	168	/* Read two elements from the row */
	169	in = *pInB++;
	170	*px = in;
	171	in = *pInB++;
	172	px[1] = in;
	173
	174	/* Update the pointer px to point to the next row of the transposed matrix */
	175	px += numRowsB * 2;
	176
	177	/* Decrement the column loop counter */
	178	col--;
	179	}
	180
	181	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
	182	** No loop unrolling is used. */
	183	col = numColsB % 0x4u;
	184
	185	while(col > 0u)
	186	{
	187	/* Read two elements from the row */
	188	in = *pInB++;
	189	*px = in;
	190	in = *pInB++;
	191	px[1] = in;
	192	#else
	193
	194	/* Read two elements from the row */
	195	in = *__SIMD32(pInB)++;
	196
	197	*__SIMD32(px) = in;
	198
	199	/* Update the pointer px to point to the next row of the transposed matrix */
	200	px += numRowsB * 2;
	201
	202
	203	/* Read two elements from the row */
	204	in = *__SIMD32(pInB)++;
	205
	206	*__SIMD32(px) = in;
	207
	208	/* Update the pointer px to point to the next row of the transposed matrix */
	209	px += numRowsB * 2;
	210
	211	/* Read two elements from the row */
	212	in = *__SIMD32(pInB)++;
	213
	214	*__SIMD32(px) = in;
	215
	216	/* Update the pointer px to point to the next row of the transposed matrix */
	217	px += numRowsB * 2;
	218
	219	/* Read two elements from the row */
	220	in = *__SIMD32(pInB)++;
	221
	222	*__SIMD32(px) = in;
	223
	224	/* Update the pointer px to point to the next row of the transposed matrix */
	225	px += numRowsB * 2;
	226
	227	/* Decrement the column loop counter */
	228	col--;
	229	}
	230
	231	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
	232	** No loop unrolling is used. */
	233	col = numColsB % 0x4u;
	234
	235	while(col > 0u)
	236	{
	237	/* Read two elements from the row */
	238	in = *__SIMD32(pInB)++;
	239
	240	*__SIMD32(px) = in;
	241	#endif
	242
	243	/* Update the pointer px to point to the next row of the transposed matrix */
	244	px += numRowsB * 2;
	245
	246	/* Decrement the column loop counter */
	247	col--;
	248	}
	249
	250	i = i + 2u;
	251
	252	/* Decrement the row loop counter */
	253	row--;
	254
	255	} while(row > 0u);
	256
	257	/* Reset the variables for the usage in the following multiplication process */
	258	row = numRowsA;
	259	i = 0u;
	260	px = pDst->pData;
	261
	262	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
	263	/* row loop */
	264	do
	265	{
	266	/* For every row wise process, the column loop counter is to be initiated */
	267	col = numColsB;
	268
	269	/* For every row wise process, the pIn2 pointer is set
	270	** to the starting address of the transposed pSrcB data */
	271	pInB = pSrcBT;
	272
	273	/* column loop */
	274	do
	275	{
	276	/* Set the variable sum, that acts as accumulator, to zero */
	277	sumReal = 0;
	278	sumImag = 0;
	279
	280	/* Apply loop unrolling and compute 2 MACs simultaneously. */
	281	colCnt = numColsA >> 1;
	282
	283	/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
	284	pInA = pSrcA->pData + i * 2;
	285
	286
	287	/* matrix multiplication */
	288	while(colCnt > 0u)
	289	{
	290	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
	291
	292	#ifdef UNALIGNED_SUPPORT_DISABLE
	293
	294	/* read real and imag values from pSrcA buffer */
	295	a = *pInA;
	296	b = *(pInA + 1u);
	297	/* read real and imag values from pSrcB buffer */
	298	c = *pInB;
	299	d = *(pInB + 1u);
	300
	301	/* Multiply and Accumlates */
	302	sumReal += (q31_t) a *c;
	303	sumImag += (q31_t) a *d;
	304	sumReal -= (q31_t) b *d;
	305	sumImag += (q31_t) b *c;
	306
	307	/* read next real and imag values from pSrcA buffer */
	308	a = *(pInA + 2u);
	309	b = *(pInA + 3u);
	310	/* read next real and imag values from pSrcB buffer */
	311	c = *(pInB + 2u);
	312	d = *(pInB + 3u);
	313
	314	/* update pointer */
	315	pInA += 4u;
	316
	317	/* Multiply and Accumlates */
	318	sumReal += (q31_t) a *c;
	319	sumImag += (q31_t) a *d;
	320	sumReal -= (q31_t) b *d;
	321	sumImag += (q31_t) b *c;
	322	/* update pointer */
	323	pInB += 4u;
	324	#else
	325	/* read real and imag values from pSrcA and pSrcB buffer */
	326	pSourceA = *__SIMD32(pInA)++;
	327	pSourceB = *__SIMD32(pInB)++;
	328
	329	/* Multiply and Accumlates */
	330	#ifdef ARM_MATH_BIG_ENDIAN
	331	prod1 = -__SMUSD(pSourceA, pSourceB);
	332	#else
	333	prod1 = __SMUSD(pSourceA, pSourceB);
	334	#endif
	335	prod2 = __SMUADX(pSourceA, pSourceB);
	336	sumReal += (q63_t) prod1;
	337	sumImag += (q63_t) prod2;
	338
	339	/* read real and imag values from pSrcA and pSrcB buffer */
	340	pSourceA = *__SIMD32(pInA)++;
	341	pSourceB = *__SIMD32(pInB)++;
	342
	343	/* Multiply and Accumlates */
	344	#ifdef ARM_MATH_BIG_ENDIAN
	345	prod1 = -__SMUSD(pSourceA, pSourceB);
	346	#else
	347	prod1 = __SMUSD(pSourceA, pSourceB);
	348	#endif
	349	prod2 = __SMUADX(pSourceA, pSourceB);
	350	sumReal += (q63_t) prod1;
	351	sumImag += (q63_t) prod2;
	352
	353	#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
	354
	355	/* Decrement the loop counter */
	356	colCnt--;
	357	}
	358
	359	/* process odd column samples */
	360	if((numColsA & 0x1u) > 0u)
	361	{
	362	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
	363
	364	#ifdef UNALIGNED_SUPPORT_DISABLE
	365
	366	/* read real and imag values from pSrcA and pSrcB buffer */
	367	a = *pInA++;
	368	b = *pInA++;
	369	c = *pInB++;
	370	d = *pInB++;
	371
	372	/* Multiply and Accumlates */
	373	sumReal += (q31_t) a *c;
	374	sumImag += (q31_t) a *d;
	375	sumReal -= (q31_t) b *d;
	376	sumImag += (q31_t) b *c;
	377
	378	#else
	379	/* read real and imag values from pSrcA and pSrcB buffer */
	380	pSourceA = *__SIMD32(pInA)++;
	381	pSourceB = *__SIMD32(pInB)++;
	382
	383	/* Multiply and Accumlates */
	384	#ifdef ARM_MATH_BIG_ENDIAN
	385	prod1 = -__SMUSD(pSourceA, pSourceB);
	386	#else
	387	prod1 = __SMUSD(pSourceA, pSourceB);
	388	#endif
	389	prod2 = __SMUADX(pSourceA, pSourceB);
	390	sumReal += (q63_t) prod1;
	391	sumImag += (q63_t) prod2;
	392
	393	#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
	394
	395	}
	396
	397	/* Saturate and store the result in the destination buffer */
	398
	399	*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
	400	*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
	401
	402	/* Decrement the column loop counter */
	403	col--;
	404
	405	} while(col > 0u);
	406
	407	i = i + numColsA;
	408
	409	/* Decrement the row loop counter */
	410	row--;
	411
	412	} while(row > 0u);
	413
	414	/* set status as ARM_MATH_SUCCESS */
	415	status = ARM_MATH_SUCCESS;
	416	}
	417
	418	/* Return to application */
	419	return (status);
	420	}
	421
	422	/**
	423	* @} end of CmplxMatrixMult group
	424	*/