F030C8xx_KBus.git

Merge remote-tracking branch 'origin/FP0' into kNet

QuakeGod

2023-02-01 6126f6a78b14297cefb02f06ba58806767d424b5

提交 \| 用户 \| age
bfc108	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_fir_f32.c
	9	*
	10	* Description: Floating-point FIR filter processing function.
	11	*
	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	13	*
	14	* Redistribution and use in source and binary forms, with or without
	15	* modification, are permitted provided that the following conditions
	16	* are met:
	17	* - Redistributions of source code must retain the above copyright
	18	* notice, this list of conditions and the following disclaimer.
	19	* - Redistributions in binary form must reproduce the above copyright
	20	* notice, this list of conditions and the following disclaimer in
	21	* the documentation and/or other materials provided with the
	22	* distribution.
	23	* - Neither the name of ARM LIMITED nor the names of its contributors
	24	* may be used to endorse or promote products derived from this
	25	* software without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	38	* POSSIBILITY OF SUCH DAMAGE.
	39	* -------------------------------------------------------------------- */
	40
	41	#include "arm_math.h"
	42
	43	/**
	44	* @ingroup groupFilters
	45	*/
	46
	47	/**
	48	* @defgroup FIR Finite Impulse Response (FIR) Filters
	49	*
	50	* This set of functions implements Finite Impulse Response (FIR) filters
	51	* for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided.
	52	* The functions operate on blocks of input and output data and each call to the function processes
	53	* <code>blockSize</code> samples through the filter. <code>pSrc</code> and
	54	* <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
	55	*
	56	* \par Algorithm:
	57	* The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
	58	* Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
	59	* <pre>
	60	* y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
	61	* </pre>
	62	* \par
	63	* \image html FIR.gif "Finite Impulse Response filter"
	64	* \par
	65	* <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
	66	* Coefficients are stored in time reversed order.
	67	* \par
	68	* <pre>
	69	* {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
	70	* </pre>
	71	* \par
	72	* <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
	73	* Samples in the state buffer are stored in the following order.
	74	* \par
	75	* <pre>
	76	* {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
	77	* </pre>
	78	* \par
	79	* Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
	80	* The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
	81	* to be avoided and yields a significant speed improvement.
	82	* The state variables are updated after each block of data is processed; the coefficients are untouched.
	83	* \par Instance Structure
	84	* The coefficients and state variables for a filter are stored together in an instance data structure.
	85	* A separate instance structure must be defined for each filter.
	86	* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
	87	* There are separate instance structure declarations for each of the 4 supported data types.
	88	*
	89	* \par Initialization Functions
	90	* There is also an associated initialization function for each data type.
	91	* The initialization function performs the following operations:
	92	* - Sets the values of the internal structure fields.
	93	* - Zeros out the values in the state buffer.
	94	* To do this manually without calling the init function, assign the following subfields of the instance structure:
	95	* numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
	96	*
	97	* \par
	98	* Use of the initialization function is optional.
	99	* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
	100	* To place an instance structure into a const data section, the instance structure must be manually initialized.
	101	* Set the values in the state buffer to zeros before static initialization.
	102	* The code below statically initializes each of the 4 different data type filter instance structures
	103	* <pre>
	104	*arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
	105	*arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
	106	*arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
	107	*arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
	108	* </pre>
	109	*
	110	* where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
	111	* <code>pCoeffs</code> is the address of the coefficient buffer.
	112	*
	113	* \par Fixed-Point Behavior
	114	* Care must be taken when using the fixed-point versions of the FIR filter functions.
	115	* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
	116	* Refer to the function specific documentation below for usage guidelines.
	117	*/
	118
	119	/**
	120	* @addtogroup FIR
	121	* @{
	122	*/
	123
	124	/**
	125	*
	126	* @param[in] *S points to an instance of the floating-point FIR filter structure.
	127	* @param[in] *pSrc points to the block of input data.
	128	* @param[out] *pDst points to the block of output data.
	129	* @param[in] blockSize number of samples to process per call.
	130	* @return none.
	131	*
	132	*/
	133
	134	#if defined(ARM_MATH_CM7)
	135
	136	void arm_fir_f32(
	137	const arm_fir_instance_f32 * S,
	138	float32_t * pSrc,
	139	float32_t * pDst,
	140	uint32_t blockSize)
	141	{
	142	float32_t pState = S->pState; / State pointer */
	143	float32_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	144	float32_t pStateCurnt; / Points to the current sample of the state */
	145	float32_t px, pb; /* Temporary pointers for state and coefficient buffers */
	146	float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
	147	float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */
	148	uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
	149	uint32_t i, tapCnt, blkCnt; /* Loop counters */
	150
	151	/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
	152	/* pStateCurnt points to the location where the new input data should be written */
	153	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	154
	155	/* Apply loop unrolling and compute 8 output values simultaneously.
	156	* The variables acc0 ... acc7 hold output values that are being computed:
	157	*
	158	* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
	159	* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
	160	* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
	161	* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
	162	*/
	163	blkCnt = blockSize >> 3;
	164
	165	/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
	166	** a second loop below computes the remaining 1 to 7 samples. */
	167	while(blkCnt > 0u)
	168	{
	169	/* Copy four new input samples into the state buffer */
	170	pStateCurnt++ = pSrc++;
	171	pStateCurnt++ = pSrc++;
	172	pStateCurnt++ = pSrc++;
	173	pStateCurnt++ = pSrc++;
	174
	175	/* Set all accumulators to zero */
	176	acc0 = 0.0f;
	177	acc1 = 0.0f;
	178	acc2 = 0.0f;
	179	acc3 = 0.0f;
	180	acc4 = 0.0f;
	181	acc5 = 0.0f;
	182	acc6 = 0.0f;
	183	acc7 = 0.0f;
	184
	185	/* Initialize state pointer */
	186	px = pState;
	187
	188	/* Initialize coeff pointer */
	189	pb = (pCoeffs);
	190
	191	/* This is separated from the others to avoid
	192	* a call to __aeabi_memmove which would be slower
	193	*/
	194	pStateCurnt++ = pSrc++;
	195	pStateCurnt++ = pSrc++;
	196	pStateCurnt++ = pSrc++;
	197	pStateCurnt++ = pSrc++;
	198
	199	/* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
	200	x0 = *px++;
	201	x1 = *px++;
	202	x2 = *px++;
	203	x3 = *px++;
	204	x4 = *px++;
	205	x5 = *px++;
	206	x6 = *px++;
	207
	208	/* Loop unrolling. Process 8 taps at a time. */
	209	tapCnt = numTaps >> 3u;
	210
	211	/* Loop over the number of taps. Unroll by a factor of 8.
	212	** Repeat until we've computed numTaps-8 coefficients. */
	213	while(tapCnt > 0u)
	214	{
	215	/* Read the b[numTaps-1] coefficient */
	216	c0 = *(pb++);
	217
	218	/* Read x[n-numTaps-3] sample */
	219	x7 = *(px++);
	220
	221	/* acc0 += b[numTaps-1] * x[n-numTaps] */
	222	acc0 += x0 * c0;
	223
	224	/* acc1 += b[numTaps-1] * x[n-numTaps-1] */
	225	acc1 += x1 * c0;
	226
	227	/* acc2 += b[numTaps-1] * x[n-numTaps-2] */
	228	acc2 += x2 * c0;
	229
	230	/* acc3 += b[numTaps-1] * x[n-numTaps-3] */
	231	acc3 += x3 * c0;
	232
	233	/* acc4 += b[numTaps-1] * x[n-numTaps-4] */
	234	acc4 += x4 * c0;
	235
	236	/* acc1 += b[numTaps-1] * x[n-numTaps-5] */
	237	acc5 += x5 * c0;
	238
	239	/* acc2 += b[numTaps-1] * x[n-numTaps-6] */
	240	acc6 += x6 * c0;
	241
	242	/* acc3 += b[numTaps-1] * x[n-numTaps-7] */
	243	acc7 += x7 * c0;
	244
	245	/* Read the b[numTaps-2] coefficient */
	246	c0 = *(pb++);
	247
	248	/* Read x[n-numTaps-4] sample */
	249	x0 = *(px++);
	250
	251	/* Perform the multiply-accumulate */
	252	acc0 += x1 * c0;
	253	acc1 += x2 * c0;
	254	acc2 += x3 * c0;
	255	acc3 += x4 * c0;
	256	acc4 += x5 * c0;
	257	acc5 += x6 * c0;
	258	acc6 += x7 * c0;
	259	acc7 += x0 * c0;
	260
	261	/* Read the b[numTaps-3] coefficient */
	262	c0 = *(pb++);
	263
	264	/* Read x[n-numTaps-5] sample */
	265	x1 = *(px++);
	266
	267	/* Perform the multiply-accumulates */
	268	acc0 += x2 * c0;
	269	acc1 += x3 * c0;
	270	acc2 += x4 * c0;
	271	acc3 += x5 * c0;
	272	acc4 += x6 * c0;
	273	acc5 += x7 * c0;
	274	acc6 += x0 * c0;
	275	acc7 += x1 * c0;
	276
	277	/* Read the b[numTaps-4] coefficient */
	278	c0 = *(pb++);
	279
	280	/* Read x[n-numTaps-6] sample */
	281	x2 = *(px++);
	282
	283	/* Perform the multiply-accumulates */
	284	acc0 += x3 * c0;
	285	acc1 += x4 * c0;
	286	acc2 += x5 * c0;
	287	acc3 += x6 * c0;
	288	acc4 += x7 * c0;
	289	acc5 += x0 * c0;
	290	acc6 += x1 * c0;
	291	acc7 += x2 * c0;
	292
	293	/* Read the b[numTaps-4] coefficient */
	294	c0 = *(pb++);
	295
	296	/* Read x[n-numTaps-6] sample */
	297	x3 = *(px++);
	298	/* Perform the multiply-accumulates */
	299	acc0 += x4 * c0;
	300	acc1 += x5 * c0;
	301	acc2 += x6 * c0;
	302	acc3 += x7 * c0;
	303	acc4 += x0 * c0;
	304	acc5 += x1 * c0;
	305	acc6 += x2 * c0;
	306	acc7 += x3 * c0;
	307
	308	/* Read the b[numTaps-4] coefficient */
	309	c0 = *(pb++);
	310
	311	/* Read x[n-numTaps-6] sample */
	312	x4 = *(px++);
	313
	314	/* Perform the multiply-accumulates */
	315	acc0 += x5 * c0;
	316	acc1 += x6 * c0;
	317	acc2 += x7 * c0;
	318	acc3 += x0 * c0;
	319	acc4 += x1 * c0;
	320	acc5 += x2 * c0;
	321	acc6 += x3 * c0;
	322	acc7 += x4 * c0;
	323
	324	/* Read the b[numTaps-4] coefficient */
	325	c0 = *(pb++);
	326
	327	/* Read x[n-numTaps-6] sample */
	328	x5 = *(px++);
	329
	330	/* Perform the multiply-accumulates */
	331	acc0 += x6 * c0;
	332	acc1 += x7 * c0;
	333	acc2 += x0 * c0;
	334	acc3 += x1 * c0;
	335	acc4 += x2 * c0;
	336	acc5 += x3 * c0;
	337	acc6 += x4 * c0;
	338	acc7 += x5 * c0;
	339
	340	/* Read the b[numTaps-4] coefficient */
	341	c0 = *(pb++);
	342
	343	/* Read x[n-numTaps-6] sample */
	344	x6 = *(px++);
	345
	346	/* Perform the multiply-accumulates */
	347	acc0 += x7 * c0;
	348	acc1 += x0 * c0;
	349	acc2 += x1 * c0;
	350	acc3 += x2 * c0;
	351	acc4 += x3 * c0;
	352	acc5 += x4 * c0;
	353	acc6 += x5 * c0;
	354	acc7 += x6 * c0;
	355
	356	tapCnt--;
	357	}
	358
	359	/* If the filter length is not a multiple of 8, compute the remaining filter taps */
	360	tapCnt = numTaps % 0x8u;
	361
	362	while(tapCnt > 0u)
	363	{
	364	/* Read coefficients */
	365	c0 = *(pb++);
	366
	367	/* Fetch 1 state variable */
	368	x7 = *(px++);
	369
	370	/* Perform the multiply-accumulates */
	371	acc0 += x0 * c0;
	372	acc1 += x1 * c0;
	373	acc2 += x2 * c0;
	374	acc3 += x3 * c0;
	375	acc4 += x4 * c0;
	376	acc5 += x5 * c0;
	377	acc6 += x6 * c0;
	378	acc7 += x7 * c0;
	379
	380	/* Reuse the present sample states for next sample */
	381	x0 = x1;
	382	x1 = x2;
	383	x2 = x3;
	384	x3 = x4;
	385	x4 = x5;
	386	x5 = x6;
	387	x6 = x7;
	388
	389	/* Decrement the loop counter */
	390	tapCnt--;
	391	}
	392
	393	/* Advance the state pointer by 8 to process the next group of 8 samples */
	394	pState = pState + 8;
	395
	396	/* The results in the 8 accumulators, store in the destination buffer. */
	397	*pDst++ = acc0;
	398	*pDst++ = acc1;
	399	*pDst++ = acc2;
	400	*pDst++ = acc3;
	401	*pDst++ = acc4;
	402	*pDst++ = acc5;
	403	*pDst++ = acc6;
	404	*pDst++ = acc7;
	405
	406	blkCnt--;
	407	}
	408
	409	/* If the blockSize is not a multiple of 8, compute any remaining output samples here.
	410	** No loop unrolling is used. */
	411	blkCnt = blockSize % 0x8u;
	412
	413	while(blkCnt > 0u)
	414	{
	415	/* Copy one sample at a time into state buffer */
	416	pStateCurnt++ = pSrc++;
	417
	418	/* Set the accumulator to zero */
	419	acc0 = 0.0f;
	420
	421	/* Initialize state pointer */
	422	px = pState;
	423
	424	/* Initialize Coefficient pointer */
	425	pb = (pCoeffs);
	426
	427	i = numTaps;
	428
	429	/* Perform the multiply-accumulates */
	430	do
	431	{
	432	acc0 += px++ *pb++;
	433	i--;
	434
	435	} while(i > 0u);
	436
	437	/* The result is store in the destination buffer. */
	438	*pDst++ = acc0;
	439
	440	/* Advance state pointer by 1 for the next sample */
	441	pState = pState + 1;
	442
	443	blkCnt--;
	444	}
	445
	446	/* Processing is complete.
	447	** Now copy the last numTaps - 1 samples to the start of the state buffer.
	448	** This prepares the state buffer for the next function call. */
	449
	450	/* Points to the start of the state buffer */
	451	pStateCurnt = S->pState;
	452
	453	tapCnt = (numTaps - 1u) >> 2u;
	454
	455	/* copy data */
	456	while(tapCnt > 0u)
	457	{
	458	pStateCurnt++ = pState++;
	459	pStateCurnt++ = pState++;
	460	pStateCurnt++ = pState++;
	461	pStateCurnt++ = pState++;
	462
	463	/* Decrement the loop counter */
	464	tapCnt--;
	465	}
	466
	467	/* Calculate remaining number of copies */
	468	tapCnt = (numTaps - 1u) % 0x4u;
	469
	470	/* Copy the remaining q31_t data */
	471	while(tapCnt > 0u)
	472	{
	473	pStateCurnt++ = pState++;
	474
	475	/* Decrement the loop counter */
	476	tapCnt--;
	477	}
	478	}
	479
	480	#elif defined(ARM_MATH_CM0_FAMILY)
	481
	482	void arm_fir_f32(
	483	const arm_fir_instance_f32 * S,
	484	float32_t * pSrc,
	485	float32_t * pDst,
	486	uint32_t blockSize)
	487	{
	488	float32_t pState = S->pState; / State pointer */
	489	float32_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	490	float32_t pStateCurnt; / Points to the current sample of the state */
	491	float32_t px, pb; /* Temporary pointers for state and coefficient buffers */
	492	uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
	493	uint32_t i, tapCnt, blkCnt; /* Loop counters */
	494
	495	/* Run the below code for Cortex-M0 */
	496
	497	float32_t acc;
	498
	499	/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
	500	/* pStateCurnt points to the location where the new input data should be written */
	501	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	502
	503	/* Initialize blkCnt with blockSize */
	504	blkCnt = blockSize;
	505
	506	while(blkCnt > 0u)
	507	{
	508	/* Copy one sample at a time into state buffer */
	509	pStateCurnt++ = pSrc++;
	510
	511	/* Set the accumulator to zero */
	512	acc = 0.0f;
	513
	514	/* Initialize state pointer */
	515	px = pState;
	516
	517	/* Initialize Coefficient pointer */
	518	pb = pCoeffs;
	519
	520	i = numTaps;
	521
	522	/* Perform the multiply-accumulates */
	523	do
	524	{
	525	/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
	526	acc += px++ *pb++;
	527	i--;
	528
	529	} while(i > 0u);
	530
	531	/* The result is store in the destination buffer. */
	532	*pDst++ = acc;
	533
	534	/* Advance state pointer by 1 for the next sample */
	535	pState = pState + 1;
	536
	537	blkCnt--;
	538	}
	539
	540	/* Processing is complete.
	541	** Now copy the last numTaps - 1 samples to the starting of the state buffer.
	542	** This prepares the state buffer for the next function call. */
	543
	544	/* Points to the start of the state buffer */
	545	pStateCurnt = S->pState;
	546
	547	/* Copy numTaps number of values */
	548	tapCnt = numTaps - 1u;
	549
	550	/* Copy data */
	551	while(tapCnt > 0u)
	552	{
	553	pStateCurnt++ = pState++;
	554
	555	/* Decrement the loop counter */
	556	tapCnt--;
	557	}
	558
	559	}
	560
	561	#else
	562
	563	/* Run the below code for Cortex-M4 and Cortex-M3 */
	564
	565	void arm_fir_f32(
	566	const arm_fir_instance_f32 * S,
	567	float32_t * pSrc,
	568	float32_t * pDst,
	569	uint32_t blockSize)
	570	{
	571	float32_t pState = S->pState; / State pointer */
	572	float32_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	573	float32_t pStateCurnt; / Points to the current sample of the state */
	574	float32_t px, pb; /* Temporary pointers for state and coefficient buffers */
	575	float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
	576	float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */
	577	uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
	578	uint32_t i, tapCnt, blkCnt; /* Loop counters */
	579	float32_t p0,p1,p2,p3,p4,p5,p6,p7; /* Temporary product values */
	580
	581	/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
	582	/* pStateCurnt points to the location where the new input data should be written */
	583	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	584
	585	/* Apply loop unrolling and compute 8 output values simultaneously.
	586	* The variables acc0 ... acc7 hold output values that are being computed:
	587	*
	588	* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
	589	* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
	590	* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
	591	* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
	592	*/
	593	blkCnt = blockSize >> 3;
	594
	595	/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
	596	** a second loop below computes the remaining 1 to 7 samples. */
	597	while(blkCnt > 0u)
	598	{
	599	/* Copy four new input samples into the state buffer */
	600	pStateCurnt++ = pSrc++;
	601	pStateCurnt++ = pSrc++;
	602	pStateCurnt++ = pSrc++;
	603	pStateCurnt++ = pSrc++;
	604
	605	/* Set all accumulators to zero */
	606	acc0 = 0.0f;
	607	acc1 = 0.0f;
	608	acc2 = 0.0f;
	609	acc3 = 0.0f;
	610	acc4 = 0.0f;
	611	acc5 = 0.0f;
	612	acc6 = 0.0f;
	613	acc7 = 0.0f;
	614
	615	/* Initialize state pointer */
	616	px = pState;
	617
	618	/* Initialize coeff pointer */
	619	pb = (pCoeffs);
	620
	621	/* This is separated from the others to avoid
	622	* a call to __aeabi_memmove which would be slower
	623	*/
	624	pStateCurnt++ = pSrc++;
	625	pStateCurnt++ = pSrc++;
	626	pStateCurnt++ = pSrc++;
	627	pStateCurnt++ = pSrc++;
	628
	629	/* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
	630	x0 = *px++;
	631	x1 = *px++;
	632	x2 = *px++;
	633	x3 = *px++;
	634	x4 = *px++;
	635	x5 = *px++;
	636	x6 = *px++;
	637
	638	/* Loop unrolling. Process 8 taps at a time. */
	639	tapCnt = numTaps >> 3u;
	640
	641	/* Loop over the number of taps. Unroll by a factor of 8.
	642	** Repeat until we've computed numTaps-8 coefficients. */
	643	while(tapCnt > 0u)
	644	{
	645	/* Read the b[numTaps-1] coefficient */
	646	c0 = *(pb++);
	647
	648	/* Read x[n-numTaps-3] sample */
	649	x7 = *(px++);
	650
	651	/* acc0 += b[numTaps-1] * x[n-numTaps] */
	652	p0 = x0 * c0;
	653
	654	/* acc1 += b[numTaps-1] * x[n-numTaps-1] */
	655	p1 = x1 * c0;
	656
	657	/* acc2 += b[numTaps-1] * x[n-numTaps-2] */
	658	p2 = x2 * c0;
	659
	660	/* acc3 += b[numTaps-1] * x[n-numTaps-3] */
	661	p3 = x3 * c0;
	662
	663	/* acc4 += b[numTaps-1] * x[n-numTaps-4] */
	664	p4 = x4 * c0;
	665
	666	/* acc1 += b[numTaps-1] * x[n-numTaps-5] */
	667	p5 = x5 * c0;
	668
	669	/* acc2 += b[numTaps-1] * x[n-numTaps-6] */
	670	p6 = x6 * c0;
	671
	672	/* acc3 += b[numTaps-1] * x[n-numTaps-7] */
	673	p7 = x7 * c0;
	674
	675	/* Read the b[numTaps-2] coefficient */
	676	c0 = *(pb++);
	677
	678	/* Read x[n-numTaps-4] sample */
	679	x0 = *(px++);
	680
	681	acc0 += p0;
	682	acc1 += p1;
	683	acc2 += p2;
	684	acc3 += p3;
	685	acc4 += p4;
	686	acc5 += p5;
	687	acc6 += p6;
	688	acc7 += p7;
	689
	690
	691	/* Perform the multiply-accumulate */
	692	p0 = x1 * c0;
	693	p1 = x2 * c0;
	694	p2 = x3 * c0;
	695	p3 = x4 * c0;
	696	p4 = x5 * c0;
	697	p5 = x6 * c0;
	698	p6 = x7 * c0;
	699	p7 = x0 * c0;
	700
	701	/* Read the b[numTaps-3] coefficient */
	702	c0 = *(pb++);
	703
	704	/* Read x[n-numTaps-5] sample */
	705	x1 = *(px++);
	706
	707	acc0 += p0;
	708	acc1 += p1;
	709	acc2 += p2;
	710	acc3 += p3;
	711	acc4 += p4;
	712	acc5 += p5;
	713	acc6 += p6;
	714	acc7 += p7;
	715
	716	/* Perform the multiply-accumulates */
	717	p0 = x2 * c0;
	718	p1 = x3 * c0;
	719	p2 = x4 * c0;
	720	p3 = x5 * c0;
	721	p4 = x6 * c0;
	722	p5 = x7 * c0;
	723	p6 = x0 * c0;
	724	p7 = x1 * c0;
	725
	726	/* Read the b[numTaps-4] coefficient */
	727	c0 = *(pb++);
	728
	729	/* Read x[n-numTaps-6] sample */
	730	x2 = *(px++);
	731
	732	acc0 += p0;
	733	acc1 += p1;
	734	acc2 += p2;
	735	acc3 += p3;
	736	acc4 += p4;
	737	acc5 += p5;
	738	acc6 += p6;
	739	acc7 += p7;
	740
	741	/* Perform the multiply-accumulates */
	742	p0 = x3 * c0;
	743	p1 = x4 * c0;
	744	p2 = x5 * c0;
	745	p3 = x6 * c0;
	746	p4 = x7 * c0;
	747	p5 = x0 * c0;
	748	p6 = x1 * c0;
	749	p7 = x2 * c0;
	750
	751	/* Read the b[numTaps-4] coefficient */
	752	c0 = *(pb++);
	753
	754	/* Read x[n-numTaps-6] sample */
	755	x3 = *(px++);
	756
	757	acc0 += p0;
	758	acc1 += p1;
	759	acc2 += p2;
	760	acc3 += p3;
	761	acc4 += p4;
	762	acc5 += p5;
	763	acc6 += p6;
	764	acc7 += p7;
	765
	766	/* Perform the multiply-accumulates */
	767	p0 = x4 * c0;
	768	p1 = x5 * c0;
	769	p2 = x6 * c0;
	770	p3 = x7 * c0;
	771	p4 = x0 * c0;
	772	p5 = x1 * c0;
	773	p6 = x2 * c0;
	774	p7 = x3 * c0;
	775
	776	/* Read the b[numTaps-4] coefficient */
	777	c0 = *(pb++);
	778
	779	/* Read x[n-numTaps-6] sample */
	780	x4 = *(px++);
	781
	782	acc0 += p0;
	783	acc1 += p1;
	784	acc2 += p2;
	785	acc3 += p3;
	786	acc4 += p4;
	787	acc5 += p5;
	788	acc6 += p6;
	789	acc7 += p7;
	790
	791	/* Perform the multiply-accumulates */
	792	p0 = x5 * c0;
	793	p1 = x6 * c0;
	794	p2 = x7 * c0;
	795	p3 = x0 * c0;
	796	p4 = x1 * c0;
	797	p5 = x2 * c0;
	798	p6 = x3 * c0;
	799	p7 = x4 * c0;
	800
	801	/* Read the b[numTaps-4] coefficient */
	802	c0 = *(pb++);
	803
	804	/* Read x[n-numTaps-6] sample */
	805	x5 = *(px++);
	806
	807	acc0 += p0;
	808	acc1 += p1;
	809	acc2 += p2;
	810	acc3 += p3;
	811	acc4 += p4;
	812	acc5 += p5;
	813	acc6 += p6;
	814	acc7 += p7;
	815
	816	/* Perform the multiply-accumulates */
	817	p0 = x6 * c0;
	818	p1 = x7 * c0;
	819	p2 = x0 * c0;
	820	p3 = x1 * c0;
	821	p4 = x2 * c0;
	822	p5 = x3 * c0;
	823	p6 = x4 * c0;
	824	p7 = x5 * c0;
	825
	826	/* Read the b[numTaps-4] coefficient */
	827	c0 = *(pb++);
	828
	829	/* Read x[n-numTaps-6] sample */
	830	x6 = *(px++);
	831
	832	acc0 += p0;
	833	acc1 += p1;
	834	acc2 += p2;
	835	acc3 += p3;
	836	acc4 += p4;
	837	acc5 += p5;
	838	acc6 += p6;
	839	acc7 += p7;
	840
	841	/* Perform the multiply-accumulates */
	842	p0 = x7 * c0;
	843	p1 = x0 * c0;
	844	p2 = x1 * c0;
	845	p3 = x2 * c0;
	846	p4 = x3 * c0;
	847	p5 = x4 * c0;
	848	p6 = x5 * c0;
	849	p7 = x6 * c0;
	850
	851	tapCnt--;
	852
	853	acc0 += p0;
	854	acc1 += p1;
	855	acc2 += p2;
	856	acc3 += p3;
	857	acc4 += p4;
	858	acc5 += p5;
	859	acc6 += p6;
	860	acc7 += p7;
	861	}
	862
	863	/* If the filter length is not a multiple of 8, compute the remaining filter taps */
	864	tapCnt = numTaps % 0x8u;
	865
	866	while(tapCnt > 0u)
	867	{
	868	/* Read coefficients */
	869	c0 = *(pb++);
	870
	871	/* Fetch 1 state variable */
	872	x7 = *(px++);
	873
	874	/* Perform the multiply-accumulates */
	875	p0 = x0 * c0;
	876	p1 = x1 * c0;
	877	p2 = x2 * c0;
	878	p3 = x3 * c0;
	879	p4 = x4 * c0;
	880	p5 = x5 * c0;
	881	p6 = x6 * c0;
	882	p7 = x7 * c0;
	883
	884	/* Reuse the present sample states for next sample */
	885	x0 = x1;
	886	x1 = x2;
	887	x2 = x3;
	888	x3 = x4;
	889	x4 = x5;
	890	x5 = x6;
	891	x6 = x7;
	892
	893	acc0 += p0;
	894	acc1 += p1;
	895	acc2 += p2;
	896	acc3 += p3;
	897	acc4 += p4;
	898	acc5 += p5;
	899	acc6 += p6;
	900	acc7 += p7;
	901
	902	/* Decrement the loop counter */
	903	tapCnt--;
	904	}
	905
	906	/* Advance the state pointer by 8 to process the next group of 8 samples */
	907	pState = pState + 8;
	908
	909	/* The results in the 8 accumulators, store in the destination buffer. */
	910	*pDst++ = acc0;
	911	*pDst++ = acc1;
	912	*pDst++ = acc2;
	913	*pDst++ = acc3;
	914	*pDst++ = acc4;
	915	*pDst++ = acc5;
	916	*pDst++ = acc6;
	917	*pDst++ = acc7;
	918
	919	blkCnt--;
	920	}
	921
	922	/* If the blockSize is not a multiple of 8, compute any remaining output samples here.
	923	** No loop unrolling is used. */
	924	blkCnt = blockSize % 0x8u;
	925
	926	while(blkCnt > 0u)
	927	{
	928	/* Copy one sample at a time into state buffer */
	929	pStateCurnt++ = pSrc++;
	930
	931	/* Set the accumulator to zero */
	932	acc0 = 0.0f;
	933
	934	/* Initialize state pointer */
	935	px = pState;
	936
	937	/* Initialize Coefficient pointer */
	938	pb = (pCoeffs);
	939
	940	i = numTaps;
	941
	942	/* Perform the multiply-accumulates */
	943	do
	944	{
	945	acc0 += px++ *pb++;
	946	i--;
	947
	948	} while(i > 0u);
	949
	950	/* The result is store in the destination buffer. */
	951	*pDst++ = acc0;
	952
	953	/* Advance state pointer by 1 for the next sample */
	954	pState = pState + 1;
	955
	956	blkCnt--;
	957	}
	958
	959	/* Processing is complete.
	960	** Now copy the last numTaps - 1 samples to the start of the state buffer.
	961	** This prepares the state buffer for the next function call. */
	962
	963	/* Points to the start of the state buffer */
	964	pStateCurnt = S->pState;
	965
	966	tapCnt = (numTaps - 1u) >> 2u;
	967
	968	/* copy data */
	969	while(tapCnt > 0u)
	970	{
	971	pStateCurnt++ = pState++;
	972	pStateCurnt++ = pState++;
	973	pStateCurnt++ = pState++;
	974	pStateCurnt++ = pState++;
	975
	976	/* Decrement the loop counter */
	977	tapCnt--;
	978	}
	979
	980	/* Calculate remaining number of copies */
	981	tapCnt = (numTaps - 1u) % 0x4u;
	982
	983	/* Copy the remaining q31_t data */
	984	while(tapCnt > 0u)
	985	{
	986	pStateCurnt++ = pState++;
	987
	988	/* Decrement the loop counter */
	989	tapCnt--;
	990	}
	991	}
	992
	993	#endif
	994
	995	/**
	996	* @} end of FIR group
	997	*/