F030C8xx_KBus.git

Merge remote-tracking branch 'origin/FP0' into kNet

QuakeGod

2023-02-01 6126f6a78b14297cefb02f06ba58806767d424b5

提交 \| 用户 \| age
bfc108	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_conv_q15.c
	9	*
	10	* Description: Convolution of Q15 sequences.
	11	*
	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	13	*
	14	* Redistribution and use in source and binary forms, with or without
	15	* modification, are permitted provided that the following conditions
	16	* are met:
	17	* - Redistributions of source code must retain the above copyright
	18	* notice, this list of conditions and the following disclaimer.
	19	* - Redistributions in binary form must reproduce the above copyright
	20	* notice, this list of conditions and the following disclaimer in
	21	* the documentation and/or other materials provided with the
	22	* distribution.
	23	* - Neither the name of ARM LIMITED nor the names of its contributors
	24	* may be used to endorse or promote products derived from this
	25	* software without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	38	* POSSIBILITY OF SUCH DAMAGE.
	39	* -------------------------------------------------------------------- */
	40
	41	#include "arm_math.h"
	42
	43	/**
	44	* @ingroup groupFilters
	45	*/
	46
	47	/**
	48	* @addtogroup Conv
	49	* @{
	50	*/
	51
	52	/**
	53	* @brief Convolution of Q15 sequences.
	54	* @param[in] *pSrcA points to the first input sequence.
	55	* @param[in] srcALen length of the first input sequence.
	56	* @param[in] *pSrcB points to the second input sequence.
	57	* @param[in] srcBLen length of the second input sequence.
	58	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
	59	* @return none.
	60	*
	61	* @details
	62	* <b>Scaling and Overflow Behavior:</b>
	63	*
	64	* \par
	65	* The function is implemented using a 64-bit internal accumulator.
	66	* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
	67	* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
	68	* This approach provides 33 guard bits and there is no risk of overflow.
	69	* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
	70	*
	71	* \par
	72	* Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
	73	*
	74	* \par
	75	* Refer the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
	76	*
	77	*/
	78
	79	void arm_conv_q15(
	80	q15_t * pSrcA,
	81	uint32_t srcALen,
	82	q15_t * pSrcB,
	83	uint32_t srcBLen,
	84	q15_t * pDst)
	85	{
	86
	87	#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
	88
	89	/* Run the below code for Cortex-M4 and Cortex-M3 */
	90
	91	q15_t *pIn1; /* inputA pointer */
	92	q15_t *pIn2; /* inputB pointer */
	93	q15_t *pOut = pDst; /* output pointer */
	94	q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
	95	q15_t *px; /* Intermediate inputA pointer */
	96	q15_t *py; /* Intermediate inputB pointer */
	97	q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
	98	q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
	99	uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
	100
	101	/* The algorithm implementation is based on the lengths of the inputs. */
	102	/* srcB is always made to slide across srcA. */
	103	/* So srcBLen is always considered as shorter or equal to srcALen */
	104	if(srcALen >= srcBLen)
	105	{
	106	/* Initialization of inputA pointer */
	107	pIn1 = pSrcA;
	108
	109	/* Initialization of inputB pointer */
	110	pIn2 = pSrcB;
	111	}
	112	else
	113	{
	114	/* Initialization of inputA pointer */
	115	pIn1 = pSrcB;
	116
	117	/* Initialization of inputB pointer */
	118	pIn2 = pSrcA;
	119
	120	/* srcBLen is always considered as shorter or equal to srcALen */
	121	j = srcBLen;
	122	srcBLen = srcALen;
	123	srcALen = j;
	124	}
	125
	126	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
	127	/* The function is internally
	128	* divided into three stages according to the number of multiplications that has to be
	129	* taken place between inputA samples and inputB samples. In the first stage of the
	130	* algorithm, the multiplications increase by one for every iteration.
	131	* In the second stage of the algorithm, srcBLen number of multiplications are done.
	132	* In the third stage of the algorithm, the multiplications decrease by one
	133	* for every iteration. */
	134
	135	/* The algorithm is implemented in three stages.
	136	The loop counters of each stage is initiated here. */
	137	blockSize1 = srcBLen - 1u;
	138	blockSize2 = srcALen - (srcBLen - 1u);
	139
	140	/* --------------------------
	141	* Initializations of stage1
	142	* -------------------------*/
	143
	144	/* sum = x[0] * y[0]
	145	* sum = x[0] * y[1] + x[1] * y[0]
	146	* ....
	147	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
	148	*/
	149
	150	/* In this stage the MAC operations are increased by 1 for every iteration.
	151	The count variable holds the number of MAC operations performed */
	152	count = 1u;
	153
	154	/* Working pointer of inputA */
	155	px = pIn1;
	156
	157	/* Working pointer of inputB */
	158	py = pIn2;
	159
	160
	161	/* ------------------------
	162	* Stage1 process
	163	* ----------------------*/
	164
	165	/* For loop unrolling by 4, this stage is divided into two. */
	166	/* First part of this stage computes the MAC operations less than 4 */
	167	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
	168
	169	/* The first part of the stage starts here */
	170	while((count < 4u) && (blockSize1 > 0u))
	171	{
	172	/* Accumulator is made zero for every iteration */
	173	sum = 0;
	174
	175	/* Loop over number of MAC operations between
	176	* inputA samples and inputB samples */
	177	k = count;
	178
	179	while(k > 0u)
	180	{
	181	/* Perform the multiply-accumulates */
	182	sum = __SMLALD(*px++, *py--, sum);
	183
	184	/* Decrement the loop counter */
	185	k--;
	186	}
	187
	188	/* Store the result in the accumulator in the destination buffer. */
	189	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
	190
	191	/* Update the inputA and inputB pointers for next MAC calculation */
	192	py = pIn2 + count;
	193	px = pIn1;
	194
	195	/* Increment the MAC count */
	196	count++;
	197
	198	/* Decrement the loop counter */
	199	blockSize1--;
	200	}
	201
	202	/* The second part of the stage starts here */
	203	/* The internal loop, over count, is unrolled by 4 */
	204	/* To, read the last two inputB samples using SIMD:
	205	* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
	206	py = py - 1;
	207
	208	while(blockSize1 > 0u)
	209	{
	210	/* Accumulator is made zero for every iteration */
	211	sum = 0;
	212
	213	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	214	k = count >> 2u;
	215
	216	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	217	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	218	while(k > 0u)
	219	{
	220	/* Perform the multiply-accumulates */
	221	/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
	222	sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
	223	/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
	224	sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
	225
	226	/* Decrement the loop counter */
	227	k--;
	228	}
	229
	230	/* For the next MAC operations, the pointer py is used without SIMD
	231	* So, py is incremented by 1 */
	232	py = py + 1u;
	233
	234	/* If the count is not a multiple of 4, compute any remaining MACs here.
	235	** No loop unrolling is used. */
	236	k = count % 0x4u;
	237
	238	while(k > 0u)
	239	{
	240	/* Perform the multiply-accumulates */
	241	sum = __SMLALD(*px++, *py--, sum);
	242
	243	/* Decrement the loop counter */
	244	k--;
	245	}
	246
	247	/* Store the result in the accumulator in the destination buffer. */
	248	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
	249
	250	/* Update the inputA and inputB pointers for next MAC calculation */
	251	py = pIn2 + (count - 1u);
	252	px = pIn1;
	253
	254	/* Increment the MAC count */
	255	count++;
	256
	257	/* Decrement the loop counter */
	258	blockSize1--;
	259	}
	260
	261	/* --------------------------
	262	* Initializations of stage2
	263	* ------------------------*/
	264
	265	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
	266	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
	267	* ....
	268	* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
	269	*/
	270
	271	/* Working pointer of inputA */
	272	px = pIn1;
	273
	274	/* Working pointer of inputB */
	275	pSrc2 = pIn2 + (srcBLen - 1u);
	276	py = pSrc2;
	277
	278	/* count is the index by which the pointer pIn1 to be incremented */
	279	count = 0u;
	280
	281
	282	/* --------------------
	283	* Stage2 process
	284	* -------------------*/
	285
	286	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
	287	* So, to loop unroll over blockSize2,
	288	* srcBLen should be greater than or equal to 4 */
	289	if(srcBLen >= 4u)
	290	{
	291	/* Loop unroll over blockSize2, by 4 */
	292	blkCnt = blockSize2 >> 2u;
	293
	294	while(blkCnt > 0u)
	295	{
	296	py = py - 1u;
	297
	298	/* Set all accumulators to zero */
	299	acc0 = 0;
	300	acc1 = 0;
	301	acc2 = 0;
	302	acc3 = 0;
	303
	304
	305	/* read x[0], x[1] samples */
	306	x0 = *__SIMD32(px);
	307	/* read x[1], x[2] samples */
	308	x1 = _SIMD32_OFFSET(px+1);
	309	px+= 2u;
	310
	311
	312	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	313	k = srcBLen >> 2u;
	314
	315	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	316	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	317	do
	318	{
	319	/* Read the last two inputB samples using SIMD:
	320	* y[srcBLen - 1] and y[srcBLen - 2] */
	321	c0 = *__SIMD32(py)--;
	322
	323	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
	324	acc0 = __SMLALDX(x0, c0, acc0);
	325
	326	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
	327	acc1 = __SMLALDX(x1, c0, acc1);
	328
	329	/* Read x[2], x[3] */
	330	x2 = *__SIMD32(px);
	331
	332	/* Read x[3], x[4] */
	333	x3 = _SIMD32_OFFSET(px+1);
	334
	335	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
	336	acc2 = __SMLALDX(x2, c0, acc2);
	337
	338	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
	339	acc3 = __SMLALDX(x3, c0, acc3);
	340
	341	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
	342	c0 = *__SIMD32(py)--;
	343
	344	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
	345	acc0 = __SMLALDX(x2, c0, acc0);
	346
	347	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
	348	acc1 = __SMLALDX(x3, c0, acc1);
	349
	350	/* Read x[4], x[5] */
	351	x0 = _SIMD32_OFFSET(px+2);
	352
	353	/* Read x[5], x[6] */
	354	x1 = _SIMD32_OFFSET(px+3);
	355	px += 4u;
	356
	357	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
	358	acc2 = __SMLALDX(x0, c0, acc2);
	359
	360	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
	361	acc3 = __SMLALDX(x1, c0, acc3);
	362
	363	} while(--k);
	364
	365	/* For the next MAC operations, SIMD is not used
	366	* So, the 16 bit pointer if inputB, py is updated */
	367
	368	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
	369	** No loop unrolling is used. */
	370	k = srcBLen % 0x4u;
	371
	372	if(k == 1u)
	373	{
	374	/* Read y[srcBLen - 5] */
	375	c0 = *(py+1);
	376
	377	#ifdef ARM_MATH_BIG_ENDIAN
	378
	379	c0 = c0 << 16u;
	380
	381	#else
	382
	383	c0 = c0 & 0x0000FFFF;
	384
	385	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
	386	/* Read x[7] */
	387	x3 = *__SIMD32(px);
	388	px++;
	389
	390	/* Perform the multiply-accumulates */
	391	acc0 = __SMLALD(x0, c0, acc0);
	392	acc1 = __SMLALD(x1, c0, acc1);
	393	acc2 = __SMLALDX(x1, c0, acc2);
	394	acc3 = __SMLALDX(x3, c0, acc3);
	395	}
	396
	397	if(k == 2u)
	398	{
	399	/* Read y[srcBLen - 5], y[srcBLen - 6] */
	400	c0 = _SIMD32_OFFSET(py);
	401
	402	/* Read x[7], x[8] */
	403	x3 = *__SIMD32(px);
	404
	405	/* Read x[9] */
	406	x2 = _SIMD32_OFFSET(px+1);
	407	px += 2u;
	408
	409	/* Perform the multiply-accumulates */
	410	acc0 = __SMLALDX(x0, c0, acc0);
	411	acc1 = __SMLALDX(x1, c0, acc1);
	412	acc2 = __SMLALDX(x3, c0, acc2);
	413	acc3 = __SMLALDX(x2, c0, acc3);
	414	}
	415
	416	if(k == 3u)
	417	{
	418	/* Read y[srcBLen - 5], y[srcBLen - 6] */
	419	c0 = _SIMD32_OFFSET(py);
	420
	421	/* Read x[7], x[8] */
	422	x3 = *__SIMD32(px);
	423
	424	/* Read x[9] */
	425	x2 = _SIMD32_OFFSET(px+1);
	426
	427	/* Perform the multiply-accumulates */
	428	acc0 = __SMLALDX(x0, c0, acc0);
	429	acc1 = __SMLALDX(x1, c0, acc1);
	430	acc2 = __SMLALDX(x3, c0, acc2);
	431	acc3 = __SMLALDX(x2, c0, acc3);
	432
	433	c0 = *(py-1);
	434
	435	#ifdef ARM_MATH_BIG_ENDIAN
	436
	437	c0 = c0 << 16u;
	438	#else
	439
	440	c0 = c0 & 0x0000FFFF;
	441	#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
	442	/* Read x[10] */
	443	x3 = _SIMD32_OFFSET(px+2);
	444	px += 3u;
	445
	446	/* Perform the multiply-accumulates */
	447	acc0 = __SMLALDX(x1, c0, acc0);
	448	acc1 = __SMLALD(x2, c0, acc1);
	449	acc2 = __SMLALDX(x2, c0, acc2);
	450	acc3 = __SMLALDX(x3, c0, acc3);
	451	}
	452
	453
	454	/* Store the results in the accumulators in the destination buffer. */
	455
	456	#ifndef ARM_MATH_BIG_ENDIAN
	457
	458	*__SIMD32(pOut)++ =
	459	__PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
	460	*__SIMD32(pOut)++ =
	461	__PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
	462
	463	#else
	464
	465	*__SIMD32(pOut)++ =
	466	__PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
	467	*__SIMD32(pOut)++ =
	468	__PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
	469
	470	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	471
	472	/* Increment the pointer pIn1 index, count by 4 */
	473	count += 4u;
	474
	475	/* Update the inputA and inputB pointers for next MAC calculation */
	476	px = pIn1 + count;
	477	py = pSrc2;
	478
	479	/* Decrement the loop counter */
	480	blkCnt--;
	481	}
	482
	483	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
	484	** No loop unrolling is used. */
	485	blkCnt = blockSize2 % 0x4u;
	486
	487	while(blkCnt > 0u)
	488	{
	489	/* Accumulator is made zero for every iteration */
	490	sum = 0;
	491
	492	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	493	k = srcBLen >> 2u;
	494
	495	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	496	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	497	while(k > 0u)
	498	{
	499	/* Perform the multiply-accumulates */
	500	sum += (q63_t) ((q31_t) * px++ * *py--);
	501	sum += (q63_t) ((q31_t) * px++ * *py--);
	502	sum += (q63_t) ((q31_t) * px++ * *py--);
	503	sum += (q63_t) ((q31_t) * px++ * *py--);
	504
	505	/* Decrement the loop counter */
	506	k--;
	507	}
	508
	509	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
	510	** No loop unrolling is used. */
	511	k = srcBLen % 0x4u;
	512
	513	while(k > 0u)
	514	{
	515	/* Perform the multiply-accumulates */
	516	sum += (q63_t) ((q31_t) * px++ * *py--);
	517
	518	/* Decrement the loop counter */
	519	k--;
	520	}
	521
	522	/* Store the result in the accumulator in the destination buffer. */
	523	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
	524
	525	/* Increment the pointer pIn1 index, count by 1 */
	526	count++;
	527
	528	/* Update the inputA and inputB pointers for next MAC calculation */
	529	px = pIn1 + count;
	530	py = pSrc2;
	531
	532	/* Decrement the loop counter */
	533	blkCnt--;
	534	}
	535	}
	536	else
	537	{
	538	/* srcBLen is less than 4 here, so
	539	* the blockSize2 loop cannot be unrolled by 4 */
	540	blkCnt = blockSize2;
	541
	542	while(blkCnt > 0u)
	543	{
	544	/* Accumulator is made zero for every iteration */
	545	sum = 0;
	546
	547	/* srcBLen number of MACS should be performed */
	548	k = srcBLen;
	549
	550	while(k > 0u)
	551	{
	552	/* Perform the multiply-accumulate */
	553	sum += (q63_t) ((q31_t) * px++ * *py--);
	554
	555	/* Decrement the loop counter */
	556	k--;
	557	}
	558
	559	/* Store the result in the accumulator in the destination buffer. */
	560	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
	561
	562	/* Increment the MAC count */
	563	count++;
	564
	565	/* Update the inputA and inputB pointers for next MAC calculation */
	566	px = pIn1 + count;
	567	py = pSrc2;
	568
	569	/* Decrement the loop counter */
	570	blkCnt--;
	571	}
	572	}
	573
	574
	575	/* --------------------------
	576	* Initializations of stage3
	577	* -------------------------*/
	578
	579	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
	580	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
	581	* ....
	582	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
	583	* sum += x[srcALen-1] * y[srcBLen-1]
	584	*/
	585
	586	/* In this stage the MAC operations are decreased by 1 for every iteration.
	587	The blockSize3 variable holds the number of MAC operations performed */
	588
	589	blockSize3 = srcBLen - 1u;
	590
	591	/* Working pointer of inputA */
	592	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
	593	px = pSrc1;
	594
	595	/* Working pointer of inputB */
	596	pSrc2 = pIn2 + (srcBLen - 1u);
	597	pIn2 = pSrc2 - 1u;
	598	py = pIn2;
	599
	600	/* -------------------
	601	* Stage3 process
	602	* ------------------*/
	603
	604	/* For loop unrolling by 4, this stage is divided into two. */
	605	/* First part of this stage computes the MAC operations greater than 4 */
	606	/* Second part of this stage computes the MAC operations less than or equal to 4 */
	607
	608	/* The first part of the stage starts here */
	609	j = blockSize3 >> 2u;
	610
	611	while((j > 0u) && (blockSize3 > 0u))
	612	{
	613	/* Accumulator is made zero for every iteration */
	614	sum = 0;
	615
	616	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	617	k = blockSize3 >> 2u;
	618
	619	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	620	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	621	while(k > 0u)
	622	{
	623	/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
	624	* with y[srcBLen - 1], y[srcBLen - 2] respectively */
	625	sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
	626	/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
	627	* with y[srcBLen - 3], y[srcBLen - 4] respectively */
	628	sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
	629
	630	/* Decrement the loop counter */
	631	k--;
	632	}
	633
	634	/* For the next MAC operations, the pointer py is used without SIMD
	635	* So, py is incremented by 1 */
	636	py = py + 1u;
	637
	638	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
	639	** No loop unrolling is used. */
	640	k = blockSize3 % 0x4u;
	641
	642	while(k > 0u)
	643	{
	644	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
	645	sum = __SMLALD(*px++, *py--, sum);
	646
	647	/* Decrement the loop counter */
	648	k--;
	649	}
	650
	651	/* Store the result in the accumulator in the destination buffer. */
	652	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
	653
	654	/* Update the inputA and inputB pointers for next MAC calculation */
	655	px = ++pSrc1;
	656	py = pIn2;
	657
	658	/* Decrement the loop counter */
	659	blockSize3--;
	660
	661	j--;
	662	}
	663
	664	/* The second part of the stage starts here */
	665	/* SIMD is not used for the next MAC operations,
	666	* so pointer py is updated to read only one sample at a time */
	667	py = py + 1u;
	668
	669	while(blockSize3 > 0u)
	670	{
	671	/* Accumulator is made zero for every iteration */
	672	sum = 0;
	673
	674	/* Perform the remaining blockSize3 MACs one sample at a time. */
	675	k = blockSize3;
	676
	677	while(k > 0u)
	678	{
	679	/* Perform the multiply-accumulates */
	680	/* sum += x[srcALen-1] * y[srcBLen-1] */
	681	sum = __SMLALD(*px++, *py--, sum);
	682
	683	/* Decrement the loop counter */
	684	k--;
	685	}
	686
	687	/* Store the result in the accumulator in the destination buffer. */
	688	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
	689
	690	/* Update the inputA and inputB pointers for next MAC calculation */
	691	px = ++pSrc1;
	692	py = pSrc2;
	693
	694	/* Decrement the loop counter */
	695	blockSize3--;
	696	}
	697
	698	#else
	699
	700	/* Run the below code for Cortex-M0 */
	701
	702	q15_t *pIn1 = pSrcA; /* input pointer */
	703	q15_t *pIn2 = pSrcB; /* coefficient pointer */
	704	q63_t sum; /* Accumulator */
	705	uint32_t i, j; /* loop counter */
	706
	707	/* Loop to calculate output of convolution for output length number of times */
	708	for (i = 0; i < (srcALen + srcBLen - 1); i++)
	709	{
	710	/* Initialize sum with zero to carry on MAC operations */
	711	sum = 0;
	712
	713	/* Loop to perform MAC operations according to convolution equation */
	714	for (j = 0; j <= i; j++)
	715	{
	716	/* Check the array limitations */
	717	if(((i - j) < srcBLen) && (j < srcALen))
	718	{
	719	/* z[i] += x[i-j] * y[j] */
	720	sum += (q31_t) pIn1[j] * (pIn2[i - j]);
	721	}
	722	}
	723
	724	/* Store the output in the destination buffer */
	725	pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
	726	}
	727
	728	#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/
	729
	730	}
	731
	732	/**
	733	* @} end of Conv group
	734	*/