F030C8xx_KBus.git

Merge remote-tracking branch 'origin/FP0' into kNet

QuakeGod

2023-02-01 6126f6a78b14297cefb02f06ba58806767d424b5

提交 \| 用户 \| age
bfc108	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_fir_q15.c
	9	*
	10	* Description: Q15 FIR filter processing function.
	11	*
	12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	13	*
	14	* Redistribution and use in source and binary forms, with or without
	15	* modification, are permitted provided that the following conditions
	16	* are met:
	17	* - Redistributions of source code must retain the above copyright
	18	* notice, this list of conditions and the following disclaimer.
	19	* - Redistributions in binary form must reproduce the above copyright
	20	* notice, this list of conditions and the following disclaimer in
	21	* the documentation and/or other materials provided with the
	22	* distribution.
	23	* - Neither the name of ARM LIMITED nor the names of its contributors
	24	* may be used to endorse or promote products derived from this
	25	* software without specific prior written permission.
	26	*
	27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	38	* POSSIBILITY OF SUCH DAMAGE.
	39	* -------------------------------------------------------------------- */
	40
	41	#include "arm_math.h"
	42
	43	/**
	44	* @ingroup groupFilters
	45	*/
	46
	47	/**
	48	* @addtogroup FIR
	49	* @{
	50	*/
	51
	52	/**
	53	* @brief Processing function for the Q15 FIR filter.
	54	* @param[in] *S points to an instance of the Q15 FIR structure.
	55	* @param[in] *pSrc points to the block of input data.
	56	* @param[out] *pDst points to the block of output data.
	57	* @param[in] blockSize number of samples to process per call.
	58	* @return none.
	59	*
	60	*
	61	* \par Restrictions
	62	* If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
	63	* In this case the input, output and state buffers must be 32-bit aligned.
	64	*
	65	* <b>Scaling and Overflow Behavior:</b>
	66	* \par
	67	* The function is implemented using a 64-bit internal accumulator.
	68	* Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
	69	* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
	70	* There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
	71	* After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
	72	* Lastly, the accumulator is saturated to yield a result in 1.15 format.
	73	*
	74	* \par
	75	* Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.
	76	*/
	77
	78	#ifndef ARM_MATH_CM0_FAMILY
	79
	80	/* Run the below code for Cortex-M4 and Cortex-M3 */
	81
	82	#ifndef UNALIGNED_SUPPORT_DISABLE
	83
	84
	85	void arm_fir_q15(
	86	const arm_fir_instance_q15 * S,
	87	q15_t * pSrc,
	88	q15_t * pDst,
	89	uint32_t blockSize)
	90	{
	91	q15_t pState = S->pState; / State pointer */
	92	q15_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	93	q15_t pStateCurnt; / Points to the current sample of the state */
	94	q15_t px1; / Temporary q15 pointer for state buffer */
	95	q15_t pb; / Temporary pointer for coefficient buffer */
	96	q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */
	97	q63_t acc0, acc1, acc2, acc3; /* Accumulators */
	98	uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
	99	uint32_t tapCnt, blkCnt; /* Loop counters */
	100
	101
	102	/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
	103	/* pStateCurnt points to the location where the new input data should be written */
	104	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	105
	106	/* Apply loop unrolling and compute 4 output values simultaneously.
	107	* The variables acc0 ... acc3 hold output values that are being computed:
	108	*
	109	* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
	110	* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
	111	* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
	112	* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
	113	*/
	114
	115	blkCnt = blockSize >> 2;
	116
	117	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
	118	** a second loop below computes the remaining 1 to 3 samples. */
	119	while(blkCnt > 0u)
	120	{
	121	/* Copy four new input samples into the state buffer.
	122	** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
	123	__SIMD32(pStateCurnt)++ = __SIMD32(pSrc)++;
	124	__SIMD32(pStateCurnt)++ = __SIMD32(pSrc)++;
	125
	126	/* Set all accumulators to zero */
	127	acc0 = 0;
	128	acc1 = 0;
	129	acc2 = 0;
	130	acc3 = 0;
	131
	132	/* Initialize state pointer of type q15 */
	133	px1 = pState;
	134
	135	/* Initialize coeff pointer of type q31 */
	136	pb = pCoeffs;
	137
	138	/* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
	139	x0 = _SIMD32_OFFSET(px1);
	140
	141	/* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */
	142	x1 = _SIMD32_OFFSET(px1 + 1u);
	143
	144	px1 += 2u;
	145
	146	/* Loop over the number of taps. Unroll by a factor of 4.
	147	** Repeat until we've computed numTaps-4 coefficients. */
	148	tapCnt = numTaps >> 2;
	149
	150	while(tapCnt > 0u)
	151	{
	152	/* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
	153	c0 = *__SIMD32(pb)++;
	154
	155	/* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
	156	acc0 = __SMLALD(x0, c0, acc0);
	157
	158	/* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
	159	acc1 = __SMLALD(x1, c0, acc1);
	160
	161	/* Read state x[n-N-2], x[n-N-3] */
	162	x2 = _SIMD32_OFFSET(px1);
	163
	164	/* Read state x[n-N-3], x[n-N-4] */
	165	x3 = _SIMD32_OFFSET(px1 + 1u);
	166
	167	/* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
	168	acc2 = __SMLALD(x2, c0, acc2);
	169
	170	/* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
	171	acc3 = __SMLALD(x3, c0, acc3);
	172
	173	/* Read coefficients b[N-2], b[N-3] */
	174	c0 = *__SIMD32(pb)++;
	175
	176	/* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
	177	acc0 = __SMLALD(x2, c0, acc0);
	178
	179	/* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
	180	acc1 = __SMLALD(x3, c0, acc1);
	181
	182	/* Read state x[n-N-4], x[n-N-5] */
	183	x0 = _SIMD32_OFFSET(px1 + 2u);
	184
	185	/* Read state x[n-N-5], x[n-N-6] */
	186	x1 = _SIMD32_OFFSET(px1 + 3u);
	187
	188	/* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
	189	acc2 = __SMLALD(x0, c0, acc2);
	190
	191	/* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
	192	acc3 = __SMLALD(x1, c0, acc3);
	193
	194	px1 += 4u;
	195
	196	tapCnt--;
	197
	198	}
	199
	200
	201	/* If the filter length is not a multiple of 4, compute the remaining filter taps.
	202	** This is always be 2 taps since the filter length is even. */
	203	if((numTaps & 0x3u) != 0u)
	204	{
	205	/* Read 2 coefficients */
	206	c0 = *__SIMD32(pb)++;
	207
	208	/* Fetch 4 state variables */
	209	x2 = _SIMD32_OFFSET(px1);
	210
	211	x3 = _SIMD32_OFFSET(px1 + 1u);
	212
	213	/* Perform the multiply-accumulates */
	214	acc0 = __SMLALD(x0, c0, acc0);
	215
	216	px1 += 2u;
	217
	218	acc1 = __SMLALD(x1, c0, acc1);
	219	acc2 = __SMLALD(x2, c0, acc2);
	220	acc3 = __SMLALD(x3, c0, acc3);
	221	}
	222
	223	/* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
	224	** Then store the 4 outputs in the destination buffer. */
	225
	226	#ifndef ARM_MATH_BIG_ENDIAN
	227
	228	*__SIMD32(pDst)++ =
	229	__PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
	230	*__SIMD32(pDst)++ =
	231	__PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
	232
	233	#else
	234
	235	*__SIMD32(pDst)++ =
	236	__PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
	237	*__SIMD32(pDst)++ =
	238	__PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
	239
	240	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	241
	242
	243
	244	/* Advance the state pointer by 4 to process the next group of 4 samples */
	245	pState = pState + 4;
	246
	247	/* Decrement the loop counter */
	248	blkCnt--;
	249	}
	250
	251	/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
	252	** No loop unrolling is used. */
	253	blkCnt = blockSize % 0x4u;
	254	while(blkCnt > 0u)
	255	{
	256	/* Copy two samples into state buffer */
	257	pStateCurnt++ = pSrc++;
	258
	259	/* Set the accumulator to zero */
	260	acc0 = 0;
	261
	262	/* Initialize state pointer of type q15 */
	263	px1 = pState;
	264
	265	/* Initialize coeff pointer of type q31 */
	266	pb = pCoeffs;
	267
	268	tapCnt = numTaps >> 1;
	269
	270	do
	271	{
	272
	273	c0 = *__SIMD32(pb)++;
	274	x0 = *__SIMD32(px1)++;
	275
	276	acc0 = __SMLALD(x0, c0, acc0);
	277	tapCnt--;
	278	}
	279	while(tapCnt > 0u);
	280
	281	/* The result is in 2.30 format. Convert to 1.15 with saturation.
	282	** Then store the output in the destination buffer. */
	283	*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
	284
	285	/* Advance state pointer by 1 for the next sample */
	286	pState = pState + 1;
	287
	288	/* Decrement the loop counter */
	289	blkCnt--;
	290	}
	291
	292	/* Processing is complete.
	293	** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
	294	** This prepares the state buffer for the next function call. */
	295
	296	/* Points to the start of the state buffer */
	297	pStateCurnt = S->pState;
	298
	299	/* Calculation of count for copying integer writes */
	300	tapCnt = (numTaps - 1u) >> 2;
	301
	302	while(tapCnt > 0u)
	303	{
	304
	305	/* Copy state values to start of state buffer */
	306	__SIMD32(pStateCurnt)++ = __SIMD32(pState)++;
	307	__SIMD32(pStateCurnt)++ = __SIMD32(pState)++;
	308
	309	tapCnt--;
	310
	311	}
	312
	313	/* Calculation of count for remaining q15_t data */
	314	tapCnt = (numTaps - 1u) % 0x4u;
	315
	316	/* copy remaining data */
	317	while(tapCnt > 0u)
	318	{
	319	pStateCurnt++ = pState++;
	320
	321	/* Decrement the loop counter */
	322	tapCnt--;
	323	}
	324	}
	325
	326	#else /* UNALIGNED_SUPPORT_DISABLE */
	327
	328	void arm_fir_q15(
	329	const arm_fir_instance_q15 * S,
	330	q15_t * pSrc,
	331	q15_t * pDst,
	332	uint32_t blockSize)
	333	{
	334	q15_t pState = S->pState; / State pointer */
	335	q15_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	336	q15_t pStateCurnt; / Points to the current sample of the state */
	337	q63_t acc0, acc1, acc2, acc3; /* Accumulators */
	338	q15_t pb; / Temporary pointer for coefficient buffer */
	339	q15_t px; / Temporary q31 pointer for SIMD state buffer accesses */
	340	q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
	341	uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
	342	uint32_t tapCnt, blkCnt; /* Loop counters */
	343
	344
	345	/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
	346	/* pStateCurnt points to the location where the new input data should be written */
	347	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	348
	349	/* Apply loop unrolling and compute 4 output values simultaneously.
	350	* The variables acc0 ... acc3 hold output values that are being computed:
	351	*
	352	* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
	353	* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
	354	* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
	355	* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
	356	*/
	357
	358	blkCnt = blockSize >> 2;
	359
	360	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
	361	** a second loop below computes the remaining 1 to 3 samples. */
	362	while(blkCnt > 0u)
	363	{
	364	/* Copy four new input samples into the state buffer.
	365	** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
	366	pStateCurnt++ = pSrc++;
	367	pStateCurnt++ = pSrc++;
	368	pStateCurnt++ = pSrc++;
	369	pStateCurnt++ = pSrc++;
	370
	371
	372	/* Set all accumulators to zero */
	373	acc0 = 0;
	374	acc1 = 0;
	375	acc2 = 0;
	376	acc3 = 0;
	377
	378	/* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
	379	px = pState;
	380
	381	/* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
	382	pb = pCoeffs;
	383
	384	/* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
	385	x0 = *__SIMD32(px)++;
	386
	387	/* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
	388	x2 = *__SIMD32(px)++;
	389
	390	/* Loop over the number of taps. Unroll by a factor of 4.
	391	** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
	392	tapCnt = numTaps >> 2;
	393
	394	while(tapCnt > 0)
	395	{
	396	/* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
	397	c0 = *__SIMD32(pb)++;
	398
	399	/* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
	400	acc0 = __SMLALD(x0, c0, acc0);
	401
	402	/* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
	403	acc2 = __SMLALD(x2, c0, acc2);
	404
	405	/* pack x[n-N-1] and x[n-N-2] */
	406	#ifndef ARM_MATH_BIG_ENDIAN
	407	x1 = __PKHBT(x2, x0, 0);
	408	#else
	409	x1 = __PKHBT(x0, x2, 0);
	410	#endif
	411
	412	/* Read state x[n-N-4], x[n-N-5] */
	413	x0 = _SIMD32_OFFSET(px);
	414
	415	/* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
	416	acc1 = __SMLALDX(x1, c0, acc1);
	417
	418	/* pack x[n-N-3] and x[n-N-4] */
	419	#ifndef ARM_MATH_BIG_ENDIAN
	420	x1 = __PKHBT(x0, x2, 0);
	421	#else
	422	x1 = __PKHBT(x2, x0, 0);
	423	#endif
	424
	425	/* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
	426	acc3 = __SMLALDX(x1, c0, acc3);
	427
	428	/* Read coefficients b[N-2], b[N-3] */
	429	c0 = *__SIMD32(pb)++;
	430
	431	/* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
	432	acc0 = __SMLALD(x2, c0, acc0);
	433
	434	/* Read state x[n-N-6], x[n-N-7] with offset */
	435	x2 = _SIMD32_OFFSET(px + 2u);
	436
	437	/* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
	438	acc2 = __SMLALD(x0, c0, acc2);
	439
	440	/* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
	441	acc1 = __SMLALDX(x1, c0, acc1);
	442
	443	/* pack x[n-N-5] and x[n-N-6] */
	444	#ifndef ARM_MATH_BIG_ENDIAN
	445	x1 = __PKHBT(x2, x0, 0);
	446	#else
	447	x1 = __PKHBT(x0, x2, 0);
	448	#endif
	449
	450	/* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
	451	acc3 = __SMLALDX(x1, c0, acc3);
	452
	453	/* Update state pointer for next state reading */
	454	px += 4u;
	455
	456	/* Decrement tap count */
	457	tapCnt--;
	458
	459	}
	460
	461	/* If the filter length is not a multiple of 4, compute the remaining filter taps.
	462	** This is always be 2 taps since the filter length is even. */
	463	if((numTaps & 0x3u) != 0u)
	464	{
	465
	466	/* Read last two coefficients */
	467	c0 = *__SIMD32(pb)++;
	468
	469	/* Perform the multiply-accumulates */
	470	acc0 = __SMLALD(x0, c0, acc0);
	471	acc2 = __SMLALD(x2, c0, acc2);
	472
	473	/* pack state variables */
	474	#ifndef ARM_MATH_BIG_ENDIAN
	475	x1 = __PKHBT(x2, x0, 0);
	476	#else
	477	x1 = __PKHBT(x0, x2, 0);
	478	#endif
	479
	480	/* Read last state variables */
	481	x0 = *__SIMD32(px);
	482
	483	/* Perform the multiply-accumulates */
	484	acc1 = __SMLALDX(x1, c0, acc1);
	485
	486	/* pack state variables */
	487	#ifndef ARM_MATH_BIG_ENDIAN
	488	x1 = __PKHBT(x0, x2, 0);
	489	#else
	490	x1 = __PKHBT(x2, x0, 0);
	491	#endif
	492
	493	/* Perform the multiply-accumulates */
	494	acc3 = __SMLALDX(x1, c0, acc3);
	495	}
	496
	497	/* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
	498	** Then store the 4 outputs in the destination buffer. */
	499
	500	#ifndef ARM_MATH_BIG_ENDIAN
	501
	502	*__SIMD32(pDst)++ =
	503	__PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
	504
	505	*__SIMD32(pDst)++ =
	506	__PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
	507
	508	#else
	509
	510	*__SIMD32(pDst)++ =
	511	__PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
	512
	513	*__SIMD32(pDst)++ =
	514	__PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
	515
	516	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	517
	518	/* Advance the state pointer by 4 to process the next group of 4 samples */
	519	pState = pState + 4;
	520
	521	/* Decrement the loop counter */
	522	blkCnt--;
	523	}
	524
	525	/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
	526	** No loop unrolling is used. */
	527	blkCnt = blockSize % 0x4u;
	528	while(blkCnt > 0u)
	529	{
	530	/* Copy two samples into state buffer */
	531	pStateCurnt++ = pSrc++;
	532
	533	/* Set the accumulator to zero */
	534	acc0 = 0;
	535
	536	/* Use SIMD to hold states and coefficients */
	537	px = pState;
	538	pb = pCoeffs;
	539
	540	tapCnt = numTaps >> 1u;
	541
	542	do
	543	{
	544	acc0 += (q31_t) * px++ * *pb++;
	545	acc0 += (q31_t) * px++ * *pb++;
	546	tapCnt--;
	547	}
	548	while(tapCnt > 0u);
	549
	550	/* The result is in 2.30 format. Convert to 1.15 with saturation.
	551	** Then store the output in the destination buffer. */
	552	*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
	553
	554	/* Advance state pointer by 1 for the next sample */
	555	pState = pState + 1u;
	556
	557	/* Decrement the loop counter */
	558	blkCnt--;
	559	}
	560
	561	/* Processing is complete.
	562	** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
	563	** This prepares the state buffer for the next function call. */
	564
	565	/* Points to the start of the state buffer */
	566	pStateCurnt = S->pState;
	567
	568	/* Calculation of count for copying integer writes */
	569	tapCnt = (numTaps - 1u) >> 2;
	570
	571	while(tapCnt > 0u)
	572	{
	573	pStateCurnt++ = pState++;
	574	pStateCurnt++ = pState++;
	575	pStateCurnt++ = pState++;
	576	pStateCurnt++ = pState++;
	577
	578	tapCnt--;
	579
	580	}
	581
	582	/* Calculation of count for remaining q15_t data */
	583	tapCnt = (numTaps - 1u) % 0x4u;
	584
	585	/* copy remaining data */
	586	while(tapCnt > 0u)
	587	{
	588	pStateCurnt++ = pState++;
	589
	590	/* Decrement the loop counter */
	591	tapCnt--;
	592	}
	593	}
	594
	595
	596	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
	597
	598	#else /* ARM_MATH_CM0_FAMILY */
	599
	600
	601	/* Run the below code for Cortex-M0 */
	602
	603	void arm_fir_q15(
	604	const arm_fir_instance_q15 * S,
	605	q15_t * pSrc,
	606	q15_t * pDst,
	607	uint32_t blockSize)
	608	{
	609	q15_t pState = S->pState; / State pointer */
	610	q15_t pCoeffs = S->pCoeffs; / Coefficient pointer */
	611	q15_t pStateCurnt; / Points to the current sample of the state */
	612
	613
	614
	615	q15_t px; / Temporary pointer for state buffer */
	616	q15_t pb; / Temporary pointer for coefficient buffer */
	617	q63_t acc; /* Accumulator */
	618	uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */
	619	uint32_t tapCnt, blkCnt; /* Loop counters */
	620
	621	/* S->pState buffer contains previous frame (numTaps - 1) samples */
	622	/* pStateCurnt points to the location where the new input data should be written */
	623	pStateCurnt = &(S->pState[(numTaps - 1u)]);
	624
	625	/* Initialize blkCnt with blockSize */
	626	blkCnt = blockSize;
	627
	628	while(blkCnt > 0u)
	629	{
	630	/* Copy one sample at a time into state buffer */
	631	pStateCurnt++ = pSrc++;
	632
	633	/* Set the accumulator to zero */
	634	acc = 0;
	635
	636	/* Initialize state pointer */
	637	px = pState;
	638
	639	/* Initialize Coefficient pointer */
	640	pb = pCoeffs;
	641
	642	tapCnt = numTaps;
	643
	644	/* Perform the multiply-accumulates */
	645	do
	646	{
	647	/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
	648	acc += (q31_t) * px++ * *pb++;
	649	tapCnt--;
	650	} while(tapCnt > 0u);
	651
	652	/* The result is in 2.30 format. Convert to 1.15
	653	** Then store the output in the destination buffer. */
	654	*pDst++ = (q15_t) __SSAT((acc >> 15u), 16);
	655
	656	/* Advance state pointer by 1 for the next sample */
	657	pState = pState + 1;
	658
	659	/* Decrement the samples loop counter */
	660	blkCnt--;
	661	}
	662
	663	/* Processing is complete.
	664	** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
	665	** This prepares the state buffer for the next function call. */
	666
	667	/* Points to the start of the state buffer */
	668	pStateCurnt = S->pState;
	669
	670	/* Copy numTaps number of values */
	671	tapCnt = (numTaps - 1u);
	672
	673	/* copy data */
	674	while(tapCnt > 0u)
	675	{
	676	pStateCurnt++ = pState++;
	677
	678	/* Decrement the loop counter */
	679	tapCnt--;
	680	}
	681
	682	}
	683
	684	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
	685
	686
	687
	688
	689	/**
	690	* @} end of FIR group
	691	*/