F030C8xx_KBus.git

Merge remote-tracking branch 'origin/FP0' into kNet

QuakeGod

2023-02-01 6126f6a78b14297cefb02f06ba58806767d424b5

提交 \| 用户 \| age
bfc108	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_conv_q7.c
	9	*
	10	* Description: Convolution of Q7 sequences.
	11	*
	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	13	*
	14	* Redistribution and use in source and binary forms, with or without
	15	* modification, are permitted provided that the following conditions
	16	* are met:
	17	* - Redistributions of source code must retain the above copyright
	18	* notice, this list of conditions and the following disclaimer.
	19	* - Redistributions in binary form must reproduce the above copyright
	20	* notice, this list of conditions and the following disclaimer in
	21	* the documentation and/or other materials provided with the
	22	* distribution.
	23	* - Neither the name of ARM LIMITED nor the names of its contributors
	24	* may be used to endorse or promote products derived from this
	25	* software without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	38	* POSSIBILITY OF SUCH DAMAGE.
	39	* -------------------------------------------------------------------- */
	40
	41	#include "arm_math.h"
	42
	43	/**
	44	* @ingroup groupFilters
	45	*/
	46
	47	/**
	48	* @addtogroup Conv
	49	* @{
	50	*/
	51
	52	/**
	53	* @brief Convolution of Q7 sequences.
	54	* @param[in] *pSrcA points to the first input sequence.
	55	* @param[in] srcALen length of the first input sequence.
	56	* @param[in] *pSrcB points to the second input sequence.
	57	* @param[in] srcBLen length of the second input sequence.
	58	* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
	59	* @return none.
	60	*
	61	* @details
	62	* <b>Scaling and Overflow Behavior:</b>
	63	*
	64	* \par
	65	* The function is implemented using a 32-bit internal accumulator.
	66	* Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
	67	* The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
	68	* This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
	69	* The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
	70	*
	71	* \par
	72	* Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
	73	*
	74	*/
	75
	76	void arm_conv_q7(
	77	q7_t * pSrcA,
	78	uint32_t srcALen,
	79	q7_t * pSrcB,
	80	uint32_t srcBLen,
	81	q7_t * pDst)
	82	{
	83
	84
	85	#ifndef ARM_MATH_CM0_FAMILY
	86
	87	/* Run the below code for Cortex-M4 and Cortex-M3 */
	88
	89	q7_t pIn1; / inputA pointer */
	90	q7_t pIn2; / inputB pointer */
	91	q7_t pOut = pDst; / output pointer */
	92	q7_t px; / Intermediate inputA pointer */
	93	q7_t py; / Intermediate inputB pointer */
	94	q7_t pSrc1, pSrc2; /* Intermediate pointers */
	95	q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
	96	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
	97	q31_t input1, input2; /* Temporary input variables */
	98	q15_t in1, in2; /* Temporary input variables */
	99	uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
	100
	101	/* The algorithm implementation is based on the lengths of the inputs. */
	102	/* srcB is always made to slide across srcA. */
	103	/* So srcBLen is always considered as shorter or equal to srcALen */
	104	if(srcALen >= srcBLen)
	105	{
	106	/* Initialization of inputA pointer */
	107	pIn1 = pSrcA;
	108
	109	/* Initialization of inputB pointer */
	110	pIn2 = pSrcB;
	111	}
	112	else
	113	{
	114	/* Initialization of inputA pointer */
	115	pIn1 = pSrcB;
	116
	117	/* Initialization of inputB pointer */
	118	pIn2 = pSrcA;
	119
	120	/* srcBLen is always considered as shorter or equal to srcALen */
	121	j = srcBLen;
	122	srcBLen = srcALen;
	123	srcALen = j;
	124	}
	125
	126	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
	127	/* The function is internally
	128	* divided into three stages according to the number of multiplications that has to be
	129	* taken place between inputA samples and inputB samples. In the first stage of the
	130	* algorithm, the multiplications increase by one for every iteration.
	131	* In the second stage of the algorithm, srcBLen number of multiplications are done.
	132	* In the third stage of the algorithm, the multiplications decrease by one
	133	* for every iteration. */
	134
	135	/* The algorithm is implemented in three stages.
	136	The loop counters of each stage is initiated here. */
	137	blockSize1 = srcBLen - 1u;
	138	blockSize2 = (srcALen - srcBLen) + 1u;
	139	blockSize3 = blockSize1;
	140
	141	/* --------------------------
	142	* Initializations of stage1
	143	* -------------------------*/
	144
	145	/* sum = x[0] * y[0]
	146	* sum = x[0] * y[1] + x[1] * y[0]
	147	* ....
	148	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
	149	*/
	150
	151	/* In this stage the MAC operations are increased by 1 for every iteration.
	152	The count variable holds the number of MAC operations performed */
	153	count = 1u;
	154
	155	/* Working pointer of inputA */
	156	px = pIn1;
	157
	158	/* Working pointer of inputB */
	159	py = pIn2;
	160
	161
	162	/* ------------------------
	163	* Stage1 process
	164	* ----------------------*/
	165
	166	/* The first stage starts here */
	167	while(blockSize1 > 0u)
	168	{
	169	/* Accumulator is made zero for every iteration */
	170	sum = 0;
	171
	172	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	173	k = count >> 2u;
	174
	175	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	176	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	177	while(k > 0u)
	178	{
	179	/* x[0] , x[1] */
	180	in1 = (q15_t) * px++;
	181	in2 = (q15_t) * px++;
	182	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	183
	184	/* y[srcBLen - 1] , y[srcBLen - 2] */
	185	in1 = (q15_t) * py--;
	186	in2 = (q15_t) * py--;
	187	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	188
	189	/* x[0] * y[srcBLen - 1] */
	190	/* x[1] * y[srcBLen - 2] */
	191	sum = __SMLAD(input1, input2, sum);
	192
	193	/* x[2] , x[3] */
	194	in1 = (q15_t) * px++;
	195	in2 = (q15_t) * px++;
	196	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	197
	198	/* y[srcBLen - 3] , y[srcBLen - 4] */
	199	in1 = (q15_t) * py--;
	200	in2 = (q15_t) * py--;
	201	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	202
	203	/* x[2] * y[srcBLen - 3] */
	204	/* x[3] * y[srcBLen - 4] */
	205	sum = __SMLAD(input1, input2, sum);
	206
	207	/* Decrement the loop counter */
	208	k--;
	209	}
	210
	211	/* If the count is not a multiple of 4, compute any remaining MACs here.
	212	** No loop unrolling is used. */
	213	k = count % 0x4u;
	214
	215	while(k > 0u)
	216	{
	217	/* Perform the multiply-accumulates */
	218	sum += ((q15_t) * px++ * *py--);
	219
	220	/* Decrement the loop counter */
	221	k--;
	222	}
	223
	224	/* Store the result in the accumulator in the destination buffer. */
	225	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
	226
	227	/* Update the inputA and inputB pointers for next MAC calculation */
	228	py = pIn2 + count;
	229	px = pIn1;
	230
	231	/* Increment the MAC count */
	232	count++;
	233
	234	/* Decrement the loop counter */
	235	blockSize1--;
	236	}
	237
	238	/* --------------------------
	239	* Initializations of stage2
	240	* ------------------------*/
	241
	242	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
	243	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
	244	* ....
	245	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
	246	*/
	247
	248	/* Working pointer of inputA */
	249	px = pIn1;
	250
	251	/* Working pointer of inputB */
	252	pSrc2 = pIn2 + (srcBLen - 1u);
	253	py = pSrc2;
	254
	255	/* count is index by which the pointer pIn1 to be incremented */
	256	count = 0u;
	257
	258	/* -------------------
	259	* Stage2 process
	260	* ------------------*/
	261
	262	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
	263	* So, to loop unroll over blockSize2,
	264	* srcBLen should be greater than or equal to 4 */
	265	if(srcBLen >= 4u)
	266	{
	267	/* Loop unroll over blockSize2, by 4 */
	268	blkCnt = blockSize2 >> 2u;
	269
	270	while(blkCnt > 0u)
	271	{
	272	/* Set all accumulators to zero */
	273	acc0 = 0;
	274	acc1 = 0;
	275	acc2 = 0;
	276	acc3 = 0;
	277
	278	/* read x[0], x[1], x[2] samples */
	279	x0 = *(px++);
	280	x1 = *(px++);
	281	x2 = *(px++);
	282
	283	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	284	k = srcBLen >> 2u;
	285
	286	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	287	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	288	do
	289	{
	290	/* Read y[srcBLen - 1] sample */
	291	c0 = *(py--);
	292	/* Read y[srcBLen - 2] sample */
	293	c1 = *(py--);
	294
	295	/* Read x[3] sample */
	296	x3 = *(px++);
	297
	298	/* x[0] and x[1] are packed */
	299	in1 = (q15_t) x0;
	300	in2 = (q15_t) x1;
	301
	302	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	303
	304	/* y[srcBLen - 1] and y[srcBLen - 2] are packed */
	305	in1 = (q15_t) c0;
	306	in2 = (q15_t) c1;
	307
	308	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	309
	310	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
	311	acc0 = __SMLAD(input1, input2, acc0);
	312
	313	/* x[1] and x[2] are packed */
	314	in1 = (q15_t) x1;
	315	in2 = (q15_t) x2;
	316
	317	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	318
	319	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
	320	acc1 = __SMLAD(input1, input2, acc1);
	321
	322	/* x[2] and x[3] are packed */
	323	in1 = (q15_t) x2;
	324	in2 = (q15_t) x3;
	325
	326	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	327
	328	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
	329	acc2 = __SMLAD(input1, input2, acc2);
	330
	331	/* Read x[4] sample */
	332	x0 = *(px++);
	333
	334	/* x[3] and x[4] are packed */
	335	in1 = (q15_t) x3;
	336	in2 = (q15_t) x0;
	337
	338	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	339
	340	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
	341	acc3 = __SMLAD(input1, input2, acc3);
	342
	343	/* Read y[srcBLen - 3] sample */
	344	c0 = *(py--);
	345	/* Read y[srcBLen - 4] sample */
	346	c1 = *(py--);
	347
	348	/* Read x[5] sample */
	349	x1 = *(px++);
	350
	351	/* x[2] and x[3] are packed */
	352	in1 = (q15_t) x2;
	353	in2 = (q15_t) x3;
	354
	355	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	356
	357	/* y[srcBLen - 3] and y[srcBLen - 4] are packed */
	358	in1 = (q15_t) c0;
	359	in2 = (q15_t) c1;
	360
	361	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	362
	363	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
	364	acc0 = __SMLAD(input1, input2, acc0);
	365
	366	/* x[3] and x[4] are packed */
	367	in1 = (q15_t) x3;
	368	in2 = (q15_t) x0;
	369
	370	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	371
	372	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
	373	acc1 = __SMLAD(input1, input2, acc1);
	374
	375	/* x[4] and x[5] are packed */
	376	in1 = (q15_t) x0;
	377	in2 = (q15_t) x1;
	378
	379	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	380
	381	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
	382	acc2 = __SMLAD(input1, input2, acc2);
	383
	384	/* Read x[6] sample */
	385	x2 = *(px++);
	386
	387	/* x[5] and x[6] are packed */
	388	in1 = (q15_t) x1;
	389	in2 = (q15_t) x2;
	390
	391	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	392
	393	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
	394	acc3 = __SMLAD(input1, input2, acc3);
	395
	396	} while(--k);
	397
	398	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
	399	** No loop unrolling is used. */
	400	k = srcBLen % 0x4u;
	401
	402	while(k > 0u)
	403	{
	404	/* Read y[srcBLen - 5] sample */
	405	c0 = *(py--);
	406
	407	/* Read x[7] sample */
	408	x3 = *(px++);
	409
	410	/* Perform the multiply-accumulates */
	411	/* acc0 += x[4] * y[srcBLen - 5] */
	412	acc0 += ((q15_t) x0 * c0);
	413	/* acc1 += x[5] * y[srcBLen - 5] */
	414	acc1 += ((q15_t) x1 * c0);
	415	/* acc2 += x[6] * y[srcBLen - 5] */
	416	acc2 += ((q15_t) x2 * c0);
	417	/* acc3 += x[7] * y[srcBLen - 5] */
	418	acc3 += ((q15_t) x3 * c0);
	419
	420	/* Reuse the present samples for the next MAC */
	421	x0 = x1;
	422	x1 = x2;
	423	x2 = x3;
	424
	425	/* Decrement the loop counter */
	426	k--;
	427	}
	428
	429
	430	/* Store the result in the accumulator in the destination buffer. */
	431	*pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
	432	*pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
	433	*pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
	434	*pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
	435
	436	/* Increment the pointer pIn1 index, count by 4 */
	437	count += 4u;
	438
	439	/* Update the inputA and inputB pointers for next MAC calculation */
	440	px = pIn1 + count;
	441	py = pSrc2;
	442
	443	/* Decrement the loop counter */
	444	blkCnt--;
	445	}
	446
	447	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
	448	** No loop unrolling is used. */
	449	blkCnt = blockSize2 % 0x4u;
	450
	451	while(blkCnt > 0u)
	452	{
	453	/* Accumulator is made zero for every iteration */
	454	sum = 0;
	455
	456	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	457	k = srcBLen >> 2u;
	458
	459	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	460	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	461	while(k > 0u)
	462	{
	463
	464	/* Reading two inputs of SrcA buffer and packing */
	465	in1 = (q15_t) * px++;
	466	in2 = (q15_t) * px++;
	467	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	468
	469	/* Reading two inputs of SrcB buffer and packing */
	470	in1 = (q15_t) * py--;
	471	in2 = (q15_t) * py--;
	472	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	473
	474	/* Perform the multiply-accumulates */
	475	sum = __SMLAD(input1, input2, sum);
	476
	477	/* Reading two inputs of SrcA buffer and packing */
	478	in1 = (q15_t) * px++;
	479	in2 = (q15_t) * px++;
	480	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	481
	482	/* Reading two inputs of SrcB buffer and packing */
	483	in1 = (q15_t) * py--;
	484	in2 = (q15_t) * py--;
	485	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	486
	487	/* Perform the multiply-accumulates */
	488	sum = __SMLAD(input1, input2, sum);
	489
	490	/* Decrement the loop counter */
	491	k--;
	492	}
	493
	494	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
	495	** No loop unrolling is used. */
	496	k = srcBLen % 0x4u;
	497
	498	while(k > 0u)
	499	{
	500	/* Perform the multiply-accumulates */
	501	sum += ((q15_t) * px++ * *py--);
	502
	503	/* Decrement the loop counter */
	504	k--;
	505	}
	506
	507	/* Store the result in the accumulator in the destination buffer. */
	508	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
	509
	510	/* Increment the pointer pIn1 index, count by 1 */
	511	count++;
	512
	513	/* Update the inputA and inputB pointers for next MAC calculation */
	514	px = pIn1 + count;
	515	py = pSrc2;
	516
	517	/* Decrement the loop counter */
	518	blkCnt--;
	519	}
	520	}
	521	else
	522	{
	523	/* If the srcBLen is not a multiple of 4,
	524	* the blockSize2 loop cannot be unrolled by 4 */
	525	blkCnt = blockSize2;
	526
	527	while(blkCnt > 0u)
	528	{
	529	/* Accumulator is made zero for every iteration */
	530	sum = 0;
	531
	532	/* srcBLen number of MACS should be performed */
	533	k = srcBLen;
	534
	535	while(k > 0u)
	536	{
	537	/* Perform the multiply-accumulate */
	538	sum += ((q15_t) * px++ * *py--);
	539
	540	/* Decrement the loop counter */
	541	k--;
	542	}
	543
	544	/* Store the result in the accumulator in the destination buffer. */
	545	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
	546
	547	/* Increment the MAC count */
	548	count++;
	549
	550	/* Update the inputA and inputB pointers for next MAC calculation */
	551	px = pIn1 + count;
	552	py = pSrc2;
	553
	554	/* Decrement the loop counter */
	555	blkCnt--;
	556	}
	557	}
	558
	559
	560	/* --------------------------
	561	* Initializations of stage3
	562	* -------------------------*/
	563
	564	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
	565	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
	566	* ....
	567	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
	568	* sum += x[srcALen-1] * y[srcBLen-1]
	569	*/
	570
	571	/* In this stage the MAC operations are decreased by 1 for every iteration.
	572	The blockSize3 variable holds the number of MAC operations performed */
	573
	574	/* Working pointer of inputA */
	575	pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
	576	px = pSrc1;
	577
	578	/* Working pointer of inputB */
	579	pSrc2 = pIn2 + (srcBLen - 1u);
	580	py = pSrc2;
	581
	582	/* -------------------
	583	* Stage3 process
	584	* ------------------*/
	585
	586	while(blockSize3 > 0u)
	587	{
	588	/* Accumulator is made zero for every iteration */
	589	sum = 0;
	590
	591	/* Apply loop unrolling and compute 4 MACs simultaneously. */
	592	k = blockSize3 >> 2u;
	593
	594	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
	595	** a second loop below computes MACs for the remaining 1 to 3 samples. */
	596	while(k > 0u)
	597	{
	598	/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
	599	in1 = (q15_t) * px++;
	600	in2 = (q15_t) * px++;
	601	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	602
	603	/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
	604	in1 = (q15_t) * py--;
	605	in2 = (q15_t) * py--;
	606	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	607
	608	/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
	609	/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
	610	sum = __SMLAD(input1, input2, sum);
	611
	612	/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
	613	in1 = (q15_t) * px++;
	614	in2 = (q15_t) * px++;
	615	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	616
	617	/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
	618	in1 = (q15_t) * py--;
	619	in2 = (q15_t) * py--;
	620	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16u);
	621
	622	/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
	623	/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
	624	sum = __SMLAD(input1, input2, sum);
	625
	626	/* Decrement the loop counter */
	627	k--;
	628	}
	629
	630	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
	631	** No loop unrolling is used. */
	632	k = blockSize3 % 0x4u;
	633
	634	while(k > 0u)
	635	{
	636	/* Perform the multiply-accumulates */
	637	sum += ((q15_t) * px++ * *py--);
	638
	639	/* Decrement the loop counter */
	640	k--;
	641	}
	642
	643	/* Store the result in the accumulator in the destination buffer. */
	644	*pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
	645
	646	/* Update the inputA and inputB pointers for next MAC calculation */
	647	px = ++pSrc1;
	648	py = pSrc2;
	649
	650	/* Decrement the loop counter */
	651	blockSize3--;
	652	}
	653
	654	#else
	655
	656	/* Run the below code for Cortex-M0 */
	657
	658	q7_t pIn1 = pSrcA; / input pointer */
	659	q7_t pIn2 = pSrcB; / coefficient pointer */
	660	q31_t sum; /* Accumulator */
	661	uint32_t i, j; /* loop counter */
	662
	663	/* Loop to calculate output of convolution for output length number of times */
	664	for (i = 0; i < (srcALen + srcBLen - 1); i++)
	665	{
	666	/* Initialize sum with zero to carry on MAC operations */
	667	sum = 0;
	668
	669	/* Loop to perform MAC operations according to convolution equation */
	670	for (j = 0; j <= i; j++)
	671	{
	672	/* Check the array limitations */
	673	if(((i - j) < srcBLen) && (j < srcALen))
	674	{
	675	/* z[i] += x[i-j] * y[j] */
	676	sum += (q15_t) pIn1[j] * (pIn2[i - j]);
	677	}
	678	}
	679
	680	/* Store the output in the destination buffer */
	681	pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
	682	}
	683
	684	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
	685
	686	}
	687
	688	/**
	689	* @} end of Conv group
	690	*/