F030C8T6_Kbus_MIX.git

QuakeGod

2024-02-24 8b51c78f1b88d94a89bb8c37ae38a54f523cb597

提交 \| 用户 \| age
8b51c7	1	/* ----------------------------------------------------------------------
Q	2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	3	*
	4	* $Date: 19. March 2015
	5	* $Revision: V.1.4.5
	6	*
	7	* Project: CMSIS DSP Library
	8	* Title: arm_cfft_radix4_q15.c
	9	*
	10	* Description: This file has function definition of Radix-4 FFT & IFFT function and
	11	* In-place bit reversal using bit reversal table
	12	*
	13	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	14	*
	15	* Redistribution and use in source and binary forms, with or without
	16	* modification, are permitted provided that the following conditions
	17	* are met:
	18	* - Redistributions of source code must retain the above copyright
	19	* notice, this list of conditions and the following disclaimer.
	20	* - Redistributions in binary form must reproduce the above copyright
	21	* notice, this list of conditions and the following disclaimer in
	22	* the documentation and/or other materials provided with the
	23	* distribution.
	24	* - Neither the name of ARM LIMITED nor the names of its contributors
	25	* may be used to endorse or promote products derived from this
	26	* software without specific prior written permission.
	27	*
	28	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	29	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	30	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	31	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	32	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	33	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	34	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	35	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	36	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	37	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	38	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	39	* POSSIBILITY OF SUCH DAMAGE.
	40	* -------------------------------------------------------------------- */
	41
	42	#include "arm_math.h"
	43
	44
	45	void arm_radix4_butterfly_q15(
	46	q15_t * pSrc16,
	47	uint32_t fftLen,
	48	q15_t * pCoef16,
	49	uint32_t twidCoefModifier);
	50
	51	void arm_radix4_butterfly_inverse_q15(
	52	q15_t * pSrc16,
	53	uint32_t fftLen,
	54	q15_t * pCoef16,
	55	uint32_t twidCoefModifier);
	56
	57	void arm_bitreversal_q15(
	58	q15_t * pSrc,
	59	uint32_t fftLen,
	60	uint16_t bitRevFactor,
	61	uint16_t * pBitRevTab);
	62
	63	/**
	64	* @ingroup groupTransforms
	65	*/
	66
	67	/**
	68	* @addtogroup ComplexFFT
	69	* @{
	70	*/
	71
	72
	73	/**
	74	* @details
	75	* @brief Processing function for the Q15 CFFT/CIFFT.
	76	* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in a future release.
	77	* @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure.
	78	* @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
	79	* @return none.
	80	*
	81	* \par Input and output formats:
	82	* \par
	83	* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
	84	* Hence the output format is different for different FFT sizes.
	85	* The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
	86	* \par
	87	* \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
	88	* \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
	89	*/
	90
	91	void arm_cfft_radix4_q15(
	92	const arm_cfft_radix4_instance_q15 * S,
	93	q15_t * pSrc)
	94	{
	95	if(S->ifftFlag == 1u)
	96	{
	97	/* Complex IFFT radix-4 */
	98	arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle,
	99	S->twidCoefModifier);
	100	}
	101	else
	102	{
	103	/* Complex FFT radix-4 */
	104	arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle,
	105	S->twidCoefModifier);
	106	}
	107
	108	if(S->bitReverseFlag == 1u)
	109	{
	110	/* Bit Reversal */
	111	arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
	112	}
	113
	114	}
	115
	116	/**
	117	* @} end of ComplexFFT group
	118	*/
	119
	120	/*
	121	* Radix-4 FFT algorithm used is :
	122	*
	123	* Input real and imaginary data:
	124	* x(n) = xa + j * ya
	125	* x(n+N/4 ) = xb + j * yb
	126	* x(n+N/2 ) = xc + j * yc
	127	* x(n+3N/4) = xd + j * yd
	128	*
	129	*
	130	* Output real and imaginary data:
	131	* x(4r) = xa'+ j * ya'
	132	* x(4r+1) = xb'+ j * yb'
	133	* x(4r+2) = xc'+ j * yc'
	134	* x(4r+3) = xd'+ j * yd'
	135	*
	136	*
	137	* Twiddle factors for radix-4 FFT:
	138	* Wn = co1 + j * (- si1)
	139	* W2n = co2 + j * (- si2)
	140	* W3n = co3 + j * (- si3)
	141
	142	* The real and imaginary output values for the radix-4 butterfly are
	143	* xa' = xa + xb + xc + xd
	144	* ya' = ya + yb + yc + yd
	145	* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
	146	* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
	147	* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
	148	* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
	149	* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
	150	* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
	151	*
	152	*/
	153
	154	/**
	155	* @brief Core function for the Q15 CFFT butterfly process.
	156	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
	157	* @param[in] fftLen length of the FFT.
	158	* @param[in] *pCoef16 points to twiddle coefficient buffer.
	159	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
	160	* @return none.
	161	*/
	162
	163	void arm_radix4_butterfly_q15(
	164	q15_t * pSrc16,
	165	uint32_t fftLen,
	166	q15_t * pCoef16,
	167	uint32_t twidCoefModifier)
	168	{
	169
	170	#ifndef ARM_MATH_CM0_FAMILY
	171
	172	/* Run the below code for Cortex-M4 and Cortex-M3 */
	173
	174	q31_t R, S, T, U;
	175	q31_t C1, C2, C3, out1, out2;
	176	uint32_t n1, n2, ic, i0, j, k;
	177
	178	q15_t *ptr1;
	179	q15_t *pSi0;
	180	q15_t *pSi1;
	181	q15_t *pSi2;
	182	q15_t *pSi3;
	183
	184	q31_t xaya, xbyb, xcyc, xdyd;
	185
	186	/* Total process is divided into three stages */
	187
	188	/* process first stage, middle stages, & last stage */
	189
	190	/* Initializations for the first stage */
	191	n2 = fftLen;
	192	n1 = n2;
	193
	194	/* n2 = fftLen/4 */
	195	n2 >>= 2u;
	196
	197	/* Index for twiddle coefficient */
	198	ic = 0u;
	199
	200	/* Index for input read and output write */
	201	j = n2;
	202
	203	pSi0 = pSrc16;
	204	pSi1 = pSi0 + 2 * n2;
	205	pSi2 = pSi1 + 2 * n2;
	206	pSi3 = pSi2 + 2 * n2;
	207
	208	/* Input is in 1.15(q15) format */
	209
	210	/* start of first stage process */
	211	do
	212	{
	213	/* Butterfly implementation */
	214
	215	/* Reading i0, i0+fftLen/2 inputs */
	216	/* Read ya (real), xa(imag) input */
	217	T = _SIMD32_OFFSET(pSi0);
	218	T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
	219	T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
	220	//in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
	221	//T = ((T >> 2) & 0xFFFF0000) \| (in & 0xFFFF);
	222
	223	/* Read yc (real), xc(imag) input */
	224	S = _SIMD32_OFFSET(pSi2);
	225	S = __SHADD16(S, 0);
	226	S = __SHADD16(S, 0);
	227
	228	/* R = packed((ya + yc), (xa + xc) ) */
	229	R = __QADD16(T, S);
	230
	231	/* S = packed((ya - yc), (xa - xc) ) */
	232	S = __QSUB16(T, S);
	233
	234	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	235	/* Read yb (real), xb(imag) input */
	236	T = _SIMD32_OFFSET(pSi1);
	237	T = __SHADD16(T, 0);
	238	T = __SHADD16(T, 0);
	239
	240	/* Read yd (real), xd(imag) input */
	241	U = _SIMD32_OFFSET(pSi3);
	242	U = __SHADD16(U, 0);
	243	U = __SHADD16(U, 0);
	244
	245	/* T = packed((yb + yd), (xb + xd) ) */
	246	T = __QADD16(T, U);
	247
	248	/* writing the butterfly processed i0 sample */
	249	/* xa' = xa + xb + xc + xd */
	250	/* ya' = ya + yb + yc + yd */
	251	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
	252	pSi0 += 2;
	253
	254	/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
	255	R = __QSUB16(R, T);
	256
	257	/* co2 & si2 are read from SIMD Coefficient pointer */
	258	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
	259
	260	#ifndef ARM_MATH_BIG_ENDIAN
	261
	262	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
	263	out1 = __SMUAD(C2, R) >> 16u;
	264	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	265	out2 = __SMUSDX(C2, R);
	266
	267	#else
	268
	269	/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	270	out1 = __SMUSDX(R, C2) >> 16u;
	271	/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
	272	out2 = __SMUAD(C2, R);
	273
	274	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	275
	276	/* Reading i0+fftLen/4 */
	277	/* T = packed(yb, xb) */
	278	T = _SIMD32_OFFSET(pSi1);
	279	T = __SHADD16(T, 0);
	280	T = __SHADD16(T, 0);
	281
	282	/* writing the butterfly processed i0 + fftLen/4 sample */
	283	/* writing output(xc', yc') in little endian format */
	284	_SIMD32_OFFSET(pSi1) =
	285	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	286	pSi1 += 2;
	287
	288	/* Butterfly calculations */
	289	/* U = packed(yd, xd) */
	290	U = _SIMD32_OFFSET(pSi3);
	291	U = __SHADD16(U, 0);
	292	U = __SHADD16(U, 0);
	293
	294	/* T = packed(yb-yd, xb-xd) */
	295	T = __QSUB16(T, U);
	296
	297	#ifndef ARM_MATH_BIG_ENDIAN
	298
	299	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
	300	R = __QASX(S, T);
	301	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
	302	S = __QSAX(S, T);
	303
	304	#else
	305
	306	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
	307	R = __QSAX(S, T);
	308	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
	309	S = __QASX(S, T);
	310
	311	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	312
	313	/* co1 & si1 are read from SIMD Coefficient pointer */
	314	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
	315	/* Butterfly process for the i0+fftLen/2 sample */
	316
	317	#ifndef ARM_MATH_BIG_ENDIAN
	318
	319	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
	320	out1 = __SMUAD(C1, S) >> 16u;
	321	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
	322	out2 = __SMUSDX(C1, S);
	323
	324	#else
	325
	326	/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
	327	out1 = __SMUSDX(S, C1) >> 16u;
	328	/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
	329	out2 = __SMUAD(C1, S);
	330
	331	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	332
	333	/* writing output(xb', yb') in little endian format */
	334	_SIMD32_OFFSET(pSi2) =
	335	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
	336	pSi2 += 2;
	337
	338
	339	/* co3 & si3 are read from SIMD Coefficient pointer */
	340	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
	341	/* Butterfly process for the i0+3fftLen/4 sample */
	342
	343	#ifndef ARM_MATH_BIG_ENDIAN
	344
	345	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
	346	out1 = __SMUAD(C3, R) >> 16u;
	347	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
	348	out2 = __SMUSDX(C3, R);
	349
	350	#else
	351
	352	/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
	353	out1 = __SMUSDX(R, C3) >> 16u;
	354	/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
	355	out2 = __SMUAD(C3, R);
	356
	357	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	358
	359	/* writing output(xd', yd') in little endian format */
	360	_SIMD32_OFFSET(pSi3) =
	361	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	362	pSi3 += 2;
	363
	364	/* Twiddle coefficients index modifier */
	365	ic = ic + twidCoefModifier;
	366
	367	} while(--j);
	368	/* data is in 4.11(q11) format */
	369
	370	/* end of first stage process */
	371
	372
	373	/* start of middle stage process */
	374
	375	/* Twiddle coefficients index modifier */
	376	twidCoefModifier <<= 2u;
	377
	378	/* Calculation of Middle stage */
	379	for (k = fftLen / 4u; k > 4u; k >>= 2u)
	380	{
	381	/* Initializations for the middle stage */
	382	n1 = n2;
	383	n2 >>= 2u;
	384	ic = 0u;
	385
	386	for (j = 0u; j <= (n2 - 1u); j++)
	387	{
	388	/* index calculation for the coefficients */
	389	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
	390	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
	391	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
	392
	393	/* Twiddle coefficients index modifier */
	394	ic = ic + twidCoefModifier;
	395
	396	pSi0 = pSrc16 + 2 * j;
	397	pSi1 = pSi0 + 2 * n2;
	398	pSi2 = pSi1 + 2 * n2;
	399	pSi3 = pSi2 + 2 * n2;
	400
	401	/* Butterfly implementation */
	402	for (i0 = j; i0 < fftLen; i0 += n1)
	403	{
	404	/* Reading i0, i0+fftLen/2 inputs */
	405	/* Read ya (real), xa(imag) input */
	406	T = _SIMD32_OFFSET(pSi0);
	407
	408	/* Read yc (real), xc(imag) input */
	409	S = _SIMD32_OFFSET(pSi2);
	410
	411	/* R = packed( (ya + yc), (xa + xc)) */
	412	R = __QADD16(T, S);
	413
	414	/* S = packed((ya - yc), (xa - xc)) */
	415	S = __QSUB16(T, S);
	416
	417	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	418	/* Read yb (real), xb(imag) input */
	419	T = _SIMD32_OFFSET(pSi1);
	420
	421	/* Read yd (real), xd(imag) input */
	422	U = _SIMD32_OFFSET(pSi3);
	423
	424	/* T = packed( (yb + yd), (xb + xd)) */
	425	T = __QADD16(T, U);
	426
	427	/* writing the butterfly processed i0 sample */
	428
	429	/* xa' = xa + xb + xc + xd */
	430	/* ya' = ya + yb + yc + yd */
	431	out1 = __SHADD16(R, T);
	432	out1 = __SHADD16(out1, 0);
	433	_SIMD32_OFFSET(pSi0) = out1;
	434	pSi0 += 2 * n1;
	435
	436	/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
	437	R = __SHSUB16(R, T);
	438
	439	#ifndef ARM_MATH_BIG_ENDIAN
	440
	441	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
	442	out1 = __SMUAD(C2, R) >> 16u;
	443
	444	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	445	out2 = __SMUSDX(C2, R);
	446
	447	#else
	448
	449	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	450	out1 = __SMUSDX(R, C2) >> 16u;
	451
	452	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
	453	out2 = __SMUAD(C2, R);
	454
	455	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	456
	457	/* Reading i0+3fftLen/4 */
	458	/* Read yb (real), xb(imag) input */
	459	T = _SIMD32_OFFSET(pSi1);
	460
	461	/* writing the butterfly processed i0 + fftLen/4 sample */
	462	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
	463	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	464	_SIMD32_OFFSET(pSi1) =
	465	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	466	pSi1 += 2 * n1;
	467
	468	/* Butterfly calculations */
	469
	470	/* Read yd (real), xd(imag) input */
	471	U = _SIMD32_OFFSET(pSi3);
	472
	473	/* T = packed(yb-yd, xb-xd) */
	474	T = __QSUB16(T, U);
	475
	476	#ifndef ARM_MATH_BIG_ENDIAN
	477
	478	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
	479	R = __SHASX(S, T);
	480
	481	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
	482	S = __SHSAX(S, T);
	483
	484
	485	/* Butterfly process for the i0+fftLen/2 sample */
	486	out1 = __SMUAD(C1, S) >> 16u;
	487	out2 = __SMUSDX(C1, S);
	488
	489	#else
	490
	491	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
	492	R = __SHSAX(S, T);
	493
	494	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
	495	S = __SHASX(S, T);
	496
	497
	498	/* Butterfly process for the i0+fftLen/2 sample */
	499	out1 = __SMUSDX(S, C1) >> 16u;
	500	out2 = __SMUAD(C1, S);
	501
	502	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	503
	504	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
	505	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
	506	_SIMD32_OFFSET(pSi2) =
	507	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	508	pSi2 += 2 * n1;
	509
	510	/* Butterfly process for the i0+3fftLen/4 sample */
	511
	512	#ifndef ARM_MATH_BIG_ENDIAN
	513
	514	out1 = __SMUAD(C3, R) >> 16u;
	515	out2 = __SMUSDX(C3, R);
	516
	517	#else
	518
	519	out1 = __SMUSDX(R, C3) >> 16u;
	520	out2 = __SMUAD(C3, R);
	521
	522	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	523
	524	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
	525	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
	526	_SIMD32_OFFSET(pSi3) =
	527	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	528	pSi3 += 2 * n1;
	529	}
	530	}
	531	/* Twiddle coefficients index modifier */
	532	twidCoefModifier <<= 2u;
	533	}
	534	/* end of middle stage process */
	535
	536
	537	/* data is in 10.6(q6) format for the 1024 point */
	538	/* data is in 8.8(q8) format for the 256 point */
	539	/* data is in 6.10(q10) format for the 64 point */
	540	/* data is in 4.12(q12) format for the 16 point */
	541
	542	/* Initializations for the last stage */
	543	j = fftLen >> 2;
	544
	545	ptr1 = &pSrc16[0];
	546
	547	/* start of last stage process */
	548
	549	/* Butterfly implementation */
	550	do
	551	{
	552	/* Read xa (real), ya(imag) input */
	553	xaya = *__SIMD32(ptr1)++;
	554
	555	/* Read xb (real), yb(imag) input */
	556	xbyb = *__SIMD32(ptr1)++;
	557
	558	/* Read xc (real), yc(imag) input */
	559	xcyc = *__SIMD32(ptr1)++;
	560
	561	/* Read xd (real), yd(imag) input */
	562	xdyd = *__SIMD32(ptr1)++;
	563
	564	/* R = packed((ya + yc), (xa + xc)) */
	565	R = __QADD16(xaya, xcyc);
	566
	567	/* T = packed((yb + yd), (xb + xd)) */
	568	T = __QADD16(xbyb, xdyd);
	569
	570	/* pointer updation for writing */
	571	ptr1 = ptr1 - 8u;
	572
	573
	574	/* xa' = xa + xb + xc + xd */
	575	/* ya' = ya + yb + yc + yd */
	576	*__SIMD32(ptr1)++ = __SHADD16(R, T);
	577
	578	/* T = packed((yb + yd), (xb + xd)) */
	579	T = __QADD16(xbyb, xdyd);
	580
	581	/* xc' = (xa-xb+xc-xd) */
	582	/* yc' = (ya-yb+yc-yd) */
	583	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
	584
	585	/* S = packed((ya - yc), (xa - xc)) */
	586	S = __QSUB16(xaya, xcyc);
	587
	588	/* Read yd (real), xd(imag) input */
	589	/* T = packed( (yb - yd), (xb - xd)) */
	590	U = __QSUB16(xbyb, xdyd);
	591
	592	#ifndef ARM_MATH_BIG_ENDIAN
	593
	594	/* xb' = (xa+yb-xc-yd) */
	595	/* yb' = (ya-xb-yc+xd) */
	596	*__SIMD32(ptr1)++ = __SHSAX(S, U);
	597
	598
	599	/* xd' = (xa-yb-xc+yd) */
	600	/* yd' = (ya+xb-yc-xd) */
	601	*__SIMD32(ptr1)++ = __SHASX(S, U);
	602
	603	#else
	604
	605	/* xb' = (xa+yb-xc-yd) */
	606	/* yb' = (ya-xb-yc+xd) */
	607	*__SIMD32(ptr1)++ = __SHASX(S, U);
	608
	609
	610	/* xd' = (xa-yb-xc+yd) */
	611	/* yd' = (ya+xb-yc-xd) */
	612	*__SIMD32(ptr1)++ = __SHSAX(S, U);
	613
	614	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	615
	616	} while(--j);
	617
	618	/* end of last stage process */
	619
	620	/* output is in 11.5(q5) format for the 1024 point */
	621	/* output is in 9.7(q7) format for the 256 point */
	622	/* output is in 7.9(q9) format for the 64 point */
	623	/* output is in 5.11(q11) format for the 16 point */
	624
	625
	626	#else
	627
	628	/* Run the below code for Cortex-M0 */
	629
	630	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
	631	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
	632	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
	633
	634	/* Total process is divided into three stages */
	635
	636	/* process first stage, middle stages, & last stage */
	637
	638	/* Initializations for the first stage */
	639	n2 = fftLen;
	640	n1 = n2;
	641
	642	/* n2 = fftLen/4 */
	643	n2 >>= 2u;
	644
	645	/* Index for twiddle coefficient */
	646	ic = 0u;
	647
	648	/* Index for input read and output write */
	649	i0 = 0u;
	650	j = n2;
	651
	652	/* Input is in 1.15(q15) format */
	653
	654	/* start of first stage process */
	655	do
	656	{
	657	/* Butterfly implementation */
	658
	659	/* index calculation for the input as, */
	660	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
	661	i1 = i0 + n2;
	662	i2 = i1 + n2;
	663	i3 = i2 + n2;
	664
	665	/* Reading i0, i0+fftLen/2 inputs */
	666
	667	/* input is down scale by 4 to avoid overflow */
	668	/* Read ya (real), xa(imag) input */
	669	T0 = pSrc16[i0 * 2u] >> 2u;
	670	T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
	671
	672	/* input is down scale by 4 to avoid overflow */
	673	/* Read yc (real), xc(imag) input */
	674	S0 = pSrc16[i2 * 2u] >> 2u;
	675	S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
	676
	677	/* R0 = (ya + yc) */
	678	R0 = __SSAT(T0 + S0, 16u);
	679	/* R1 = (xa + xc) */
	680	R1 = __SSAT(T1 + S1, 16u);
	681
	682	/* S0 = (ya - yc) */
	683	S0 = __SSAT(T0 - S0, 16);
	684	/* S1 = (xa - xc) */
	685	S1 = __SSAT(T1 - S1, 16);
	686
	687	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	688	/* input is down scale by 4 to avoid overflow */
	689	/* Read yb (real), xb(imag) input */
	690	T0 = pSrc16[i1 * 2u] >> 2u;
	691	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
	692
	693	/* input is down scale by 4 to avoid overflow */
	694	/* Read yd (real), xd(imag) input */
	695	U0 = pSrc16[i3 * 2u] >> 2u;
	696	U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
	697
	698	/* T0 = (yb + yd) */
	699	T0 = __SSAT(T0 + U0, 16u);
	700	/* T1 = (xb + xd) */
	701	T1 = __SSAT(T1 + U1, 16u);
	702
	703	/* writing the butterfly processed i0 sample */
	704	/* ya' = ya + yb + yc + yd */
	705	/* xa' = xa + xb + xc + xd */
	706	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
	707	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
	708
	709	/* R0 = (ya + yc) - (yb + yd) */
	710	/* R1 = (xa + xc) - (xb + xd) */
	711	R0 = __SSAT(R0 - T0, 16u);
	712	R1 = __SSAT(R1 - T1, 16u);
	713
	714	/* co2 & si2 are read from Coefficient pointer */
	715	Co2 = pCoef16[2u * ic * 2u];
	716	Si2 = pCoef16[(2u * ic * 2u) + 1];
	717
	718	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
	719	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
	720	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	721	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
	722
	723	/* Reading i0+fftLen/4 */
	724	/* input is down scale by 4 to avoid overflow */
	725	/* T0 = yb, T1 = xb */
	726	T0 = pSrc16[i1 * 2u] >> 2;
	727	T1 = pSrc16[(i1 * 2u) + 1] >> 2;
	728
	729	/* writing the butterfly processed i0 + fftLen/4 sample */
	730	/* writing output(xc', yc') in little endian format */
	731	pSrc16[i1 * 2u] = out1;
	732	pSrc16[(i1 * 2u) + 1] = out2;
	733
	734	/* Butterfly calculations */
	735	/* input is down scale by 4 to avoid overflow */
	736	/* U0 = yd, U1 = xd */
	737	U0 = pSrc16[i3 * 2u] >> 2;
	738	U1 = pSrc16[(i3 * 2u) + 1] >> 2;
	739	/* T0 = yb-yd */
	740	T0 = __SSAT(T0 - U0, 16);
	741	/* T1 = xb-xd */
	742	T1 = __SSAT(T1 - U1, 16);
	743
	744	/* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
	745	R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
	746	R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
	747
	748	/* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
	749	S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
	750	S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
	751
	752	/* co1 & si1 are read from Coefficient pointer */
	753	Co1 = pCoef16[ic * 2u];
	754	Si1 = pCoef16[(ic * 2u) + 1];
	755	/* Butterfly process for the i0+fftLen/2 sample */
	756	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
	757	out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
	758	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
	759	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
	760
	761	/* writing output(xb', yb') in little endian format */
	762	pSrc16[i2 * 2u] = out1;
	763	pSrc16[(i2 * 2u) + 1] = out2;
	764
	765	/* Co3 & si3 are read from Coefficient pointer */
	766	Co3 = pCoef16[3u * (ic * 2u)];
	767	Si3 = pCoef16[(3u * (ic * 2u)) + 1];
	768	/* Butterfly process for the i0+3fftLen/4 sample */
	769	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
	770	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
	771	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
	772	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
	773	/* writing output(xd', yd') in little endian format */
	774	pSrc16[i3 * 2u] = out1;
	775	pSrc16[(i3 * 2u) + 1] = out2;
	776
	777	/* Twiddle coefficients index modifier */
	778	ic = ic + twidCoefModifier;
	779
	780	/* Updating input index */
	781	i0 = i0 + 1u;
	782
	783	} while(--j);
	784	/* data is in 4.11(q11) format */
	785
	786	/* end of first stage process */
	787
	788
	789	/* start of middle stage process */
	790
	791	/* Twiddle coefficients index modifier */
	792	twidCoefModifier <<= 2u;
	793
	794	/* Calculation of Middle stage */
	795	for (k = fftLen / 4u; k > 4u; k >>= 2u)
	796	{
	797	/* Initializations for the middle stage */
	798	n1 = n2;
	799	n2 >>= 2u;
	800	ic = 0u;
	801
	802	for (j = 0u; j <= (n2 - 1u); j++)
	803	{
	804	/* index calculation for the coefficients */
	805	Co1 = pCoef16[ic * 2u];
	806	Si1 = pCoef16[(ic * 2u) + 1u];
	807	Co2 = pCoef16[2u * (ic * 2u)];
	808	Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
	809	Co3 = pCoef16[3u * (ic * 2u)];
	810	Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
	811
	812	/* Twiddle coefficients index modifier */
	813	ic = ic + twidCoefModifier;
	814
	815	/* Butterfly implementation */
	816	for (i0 = j; i0 < fftLen; i0 += n1)
	817	{
	818	/* index calculation for the input as, */
	819	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
	820	i1 = i0 + n2;
	821	i2 = i1 + n2;
	822	i3 = i2 + n2;
	823
	824	/* Reading i0, i0+fftLen/2 inputs */
	825	/* Read ya (real), xa(imag) input */
	826	T0 = pSrc16[i0 * 2u];
	827	T1 = pSrc16[(i0 * 2u) + 1u];
	828
	829	/* Read yc (real), xc(imag) input */
	830	S0 = pSrc16[i2 * 2u];
	831	S1 = pSrc16[(i2 * 2u) + 1u];
	832
	833	/* R0 = (ya + yc), R1 = (xa + xc) */
	834	R0 = __SSAT(T0 + S0, 16);
	835	R1 = __SSAT(T1 + S1, 16);
	836
	837	/* S0 = (ya - yc), S1 =(xa - xc) */
	838	S0 = __SSAT(T0 - S0, 16);
	839	S1 = __SSAT(T1 - S1, 16);
	840
	841	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	842	/* Read yb (real), xb(imag) input */
	843	T0 = pSrc16[i1 * 2u];
	844	T1 = pSrc16[(i1 * 2u) + 1u];
	845
	846	/* Read yd (real), xd(imag) input */
	847	U0 = pSrc16[i3 * 2u];
	848	U1 = pSrc16[(i3 * 2u) + 1u];
	849
	850
	851	/* T0 = (yb + yd), T1 = (xb + xd) */
	852	T0 = __SSAT(T0 + U0, 16);
	853	T1 = __SSAT(T1 + U1, 16);
	854
	855	/* writing the butterfly processed i0 sample */
	856
	857	/* xa' = xa + xb + xc + xd */
	858	/* ya' = ya + yb + yc + yd */
	859	out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
	860	out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
	861
	862	pSrc16[i0 * 2u] = out1;
	863	pSrc16[(2u * i0) + 1u] = out2;
	864
	865	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
	866	R0 = (R0 >> 1u) - (T0 >> 1u);
	867	R1 = (R1 >> 1u) - (T1 >> 1u);
	868
	869	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
	870	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
	871
	872	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	873	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
	874
	875	/* Reading i0+3fftLen/4 */
	876	/* Read yb (real), xb(imag) input */
	877	T0 = pSrc16[i1 * 2u];
	878	T1 = pSrc16[(i1 * 2u) + 1u];
	879
	880	/* writing the butterfly processed i0 + fftLen/4 sample */
	881	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
	882	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
	883	pSrc16[i1 * 2u] = out1;
	884	pSrc16[(i1 * 2u) + 1u] = out2;
	885
	886	/* Butterfly calculations */
	887
	888	/* Read yd (real), xd(imag) input */
	889	U0 = pSrc16[i3 * 2u];
	890	U1 = pSrc16[(i3 * 2u) + 1u];
	891
	892	/* T0 = yb-yd, T1 = xb-xd */
	893	T0 = __SSAT(T0 - U0, 16);
	894	T1 = __SSAT(T1 - U1, 16);
	895
	896	/* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
	897	R0 = (S0 >> 1u) - (T1 >> 1u);
	898	R1 = (S1 >> 1u) + (T0 >> 1u);
	899
	900	/* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
	901	S0 = (S0 >> 1u) + (T1 >> 1u);
	902	S1 = (S1 >> 1u) - (T0 >> 1u);
	903
	904	/* Butterfly process for the i0+fftLen/2 sample */
	905	out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
	906
	907	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
	908
	909	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
	910	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
	911	pSrc16[i2 * 2u] = out1;
	912	pSrc16[(i2 * 2u) + 1u] = out2;
	913
	914	/* Butterfly process for the i0+3fftLen/4 sample */
	915	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
	916
	917	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
	918	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
	919	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
	920	pSrc16[i3 * 2u] = out1;
	921	pSrc16[(i3 * 2u) + 1u] = out2;
	922	}
	923	}
	924	/* Twiddle coefficients index modifier */
	925	twidCoefModifier <<= 2u;
	926	}
	927	/* end of middle stage process */
	928
	929
	930	/* data is in 10.6(q6) format for the 1024 point */
	931	/* data is in 8.8(q8) format for the 256 point */
	932	/* data is in 6.10(q10) format for the 64 point */
	933	/* data is in 4.12(q12) format for the 16 point */
	934
	935	/* Initializations for the last stage */
	936	n1 = n2;
	937	n2 >>= 2u;
	938
	939	/* start of last stage process */
	940
	941	/* Butterfly implementation */
	942	for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
	943	{
	944	/* index calculation for the input as, */
	945	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
	946	i1 = i0 + n2;
	947	i2 = i1 + n2;
	948	i3 = i2 + n2;
	949
	950	/* Reading i0, i0+fftLen/2 inputs */
	951	/* Read ya (real), xa(imag) input */
	952	T0 = pSrc16[i0 * 2u];
	953	T1 = pSrc16[(i0 * 2u) + 1u];
	954
	955	/* Read yc (real), xc(imag) input */
	956	S0 = pSrc16[i2 * 2u];
	957	S1 = pSrc16[(i2 * 2u) + 1u];
	958
	959	/* R0 = (ya + yc), R1 = (xa + xc) */
	960	R0 = __SSAT(T0 + S0, 16u);
	961	R1 = __SSAT(T1 + S1, 16u);
	962
	963	/* S0 = (ya - yc), S1 = (xa - xc) */
	964	S0 = __SSAT(T0 - S0, 16u);
	965	S1 = __SSAT(T1 - S1, 16u);
	966
	967	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	968	/* Read yb (real), xb(imag) input */
	969	T0 = pSrc16[i1 * 2u];
	970	T1 = pSrc16[(i1 * 2u) + 1u];
	971	/* Read yd (real), xd(imag) input */
	972	U0 = pSrc16[i3 * 2u];
	973	U1 = pSrc16[(i3 * 2u) + 1u];
	974
	975	/* T0 = (yb + yd), T1 = (xb + xd)) */
	976	T0 = __SSAT(T0 + U0, 16u);
	977	T1 = __SSAT(T1 + U1, 16u);
	978
	979	/* writing the butterfly processed i0 sample */
	980	/* xa' = xa + xb + xc + xd */
	981	/* ya' = ya + yb + yc + yd */
	982	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
	983	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
	984
	985	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
	986	R0 = (R0 >> 1u) - (T0 >> 1u);
	987	R1 = (R1 >> 1u) - (T1 >> 1u);
	988	/* Read yb (real), xb(imag) input */
	989	T0 = pSrc16[i1 * 2u];
	990	T1 = pSrc16[(i1 * 2u) + 1u];
	991
	992	/* writing the butterfly processed i0 + fftLen/4 sample */
	993	/* xc' = (xa-xb+xc-xd) */
	994	/* yc' = (ya-yb+yc-yd) */
	995	pSrc16[i1 * 2u] = R0;
	996	pSrc16[(i1 * 2u) + 1u] = R1;
	997
	998	/* Read yd (real), xd(imag) input */
	999	U0 = pSrc16[i3 * 2u];
	1000	U1 = pSrc16[(i3 * 2u) + 1u];
	1001	/* T0 = (yb - yd), T1 = (xb - xd) */
	1002	T0 = __SSAT(T0 - U0, 16u);
	1003	T1 = __SSAT(T1 - U1, 16u);
	1004
	1005	/* writing the butterfly processed i0 + fftLen/2 sample */
	1006	/* xb' = (xa+yb-xc-yd) */
	1007	/* yb' = (ya-xb-yc+xd) */
	1008	pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
	1009	pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
	1010
	1011	/* writing the butterfly processed i0 + 3fftLen/4 sample */
	1012	/* xd' = (xa-yb-xc+yd) */
	1013	/* yd' = (ya+xb-yc-xd) */
	1014	pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
	1015	pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
	1016
	1017	}
	1018
	1019	/* end of last stage process */
	1020
	1021	/* output is in 11.5(q5) format for the 1024 point */
	1022	/* output is in 9.7(q7) format for the 256 point */
	1023	/* output is in 7.9(q9) format for the 64 point */
	1024	/* output is in 5.11(q11) format for the 16 point */
	1025
	1026	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
	1027
	1028	}
	1029
	1030
	1031	/**
	1032	* @brief Core function for the Q15 CIFFT butterfly process.
	1033	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
	1034	* @param[in] fftLen length of the FFT.
	1035	* @param[in] *pCoef16 points to twiddle coefficient buffer.
	1036	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
	1037	* @return none.
	1038	*/
	1039
	1040	/*
	1041	* Radix-4 IFFT algorithm used is :
	1042	*
	1043	* CIFFT uses same twiddle coefficients as CFFT function
	1044	* x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
	1045	*
	1046	*
	1047	* IFFT is implemented with following changes in equations from FFT
	1048	*
	1049	* Input real and imaginary data:
	1050	* x(n) = xa + j * ya
	1051	* x(n+N/4 ) = xb + j * yb
	1052	* x(n+N/2 ) = xc + j * yc
	1053	* x(n+3N/4) = xd + j * yd
	1054	*
	1055	*
	1056	* Output real and imaginary data:
	1057	* x(4r) = xa'+ j * ya'
	1058	* x(4r+1) = xb'+ j * yb'
	1059	* x(4r+2) = xc'+ j * yc'
	1060	* x(4r+3) = xd'+ j * yd'
	1061	*
	1062	*
	1063	* Twiddle factors for radix-4 IFFT:
	1064	* Wn = co1 + j * (si1)
	1065	* W2n = co2 + j * (si2)
	1066	* W3n = co3 + j * (si3)
	1067
	1068	* The real and imaginary output values for the radix-4 butterfly are
	1069	* xa' = xa + xb + xc + xd
	1070	* ya' = ya + yb + yc + yd
	1071	* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
	1072	* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
	1073	* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
	1074	* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
	1075	* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
	1076	* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
	1077	*
	1078	*/
	1079
	1080	void arm_radix4_butterfly_inverse_q15(
	1081	q15_t * pSrc16,
	1082	uint32_t fftLen,
	1083	q15_t * pCoef16,
	1084	uint32_t twidCoefModifier)
	1085	{
	1086	/* In-place radix-4 inverse butterfly on the interleaved (real, imag) q15 buffer pSrc16: a first stage that pre-scales the inputs by 1/4, middle stages while k > 4, and a last stage without twiddle multiplications. */
	1087	#ifndef ARM_MATH_CM0_FAMILY
	1088
	1089	/* Run the below code for Cortex-M4 and Cortex-M3 (two q15 values are processed per 32-bit word with packed 16-bit intrinsics) */
	1090	/* NOTE(review): the real/imag notes below assume _SIMD32_OFFSET and __SIMD32 are plain 32-bit accesses to two adjacent q15 values (real first) - confirm against their definitions. */
	1091	q31_t R, S, T, U;
	1092	q31_t C1, C2, C3, out1, out2;
	1093	uint32_t n1, n2, ic, i0, j, k;
	1094
	1095	q15_t *ptr1;
	1096	q15_t *pSi0;
	1097	q15_t *pSi1;
	1098	q15_t *pSi2;
	1099	q15_t *pSi3;
	1100
	1101	q31_t xaya, xbyb, xcyc, xdyd;
	1102
	1103	/* Total process is divided into three stages */
	1104
	1105	/* process first stage, middle stages, & last stage */
	1106
	1107	/* Initializations for the first stage */
	1108	n2 = fftLen;
	1109	n1 = n2;
	1110
	1111	/* n2 = fftLen/4 */
	1112	n2 >>= 2u;
	1113
	1114	/* Index for twiddle coefficient */
	1115	ic = 0u;
	1116
	1117	/* j counts the fftLen/4 butterflies of the first stage */
	1118	j = n2;
	1119
	1120	pSi0 = pSrc16;
	1121	pSi1 = pSi0 + 2 * n2;
	1122	pSi2 = pSi1 + 2 * n2;
	1123	pSi3 = pSi2 + 2 * n2;
	1124
	1125	/* Input is in 1.15(q15) format */
	1126
	1127	/* start of first stage process */
	1128	do
	1129	{
	1130	/* Butterfly implementation */
	1131
	1132	/* Reading i0, i0+fftLen/2 inputs */
	1133	/* Read xa (real), ya (imag) input; the two halving adds with 0 scale both halves by 1/4 */
	1134	T = _SIMD32_OFFSET(pSi0);
	1135	T = __SHADD16(T, 0);
	1136	T = __SHADD16(T, 0);
	1137
	1138	/* Read xc (real), yc (imag) input, scaled by 1/4 the same way */
	1139	S = _SIMD32_OFFSET(pSi2);
	1140	S = __SHADD16(S, 0);
	1141	S = __SHADD16(S, 0);
	1142
	1143	/* R: real = (xa + xc), imag = (ya + yc), saturating */
	1144	R = __QADD16(T, S);
	1145
	1146	/* S: real = (xa - xc), imag = (ya - yc), saturating */
	1147	S = __QSUB16(T, S);
	1148
	1149	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	1150	/* Read xb (real), yb (imag) input, scaled by 1/4 */
	1151	T = _SIMD32_OFFSET(pSi1);
	1152	T = __SHADD16(T, 0);
	1153	T = __SHADD16(T, 0);
	1154
	1155	/* Read xd (real), yd (imag) input, scaled by 1/4 */
	1156	U = _SIMD32_OFFSET(pSi3);
	1157	U = __SHADD16(U, 0);
	1158	U = __SHADD16(U, 0);
	1159
	1160	/* T: real = (xb + xd), imag = (yb + yd), saturating */
	1161	T = __QADD16(T, U);
	1162
	1163	/* writing the butterfly processed i0 sample (the sum is halved by __SHADD16) */
	1164	/* xa' = xa + xb + xc + xd */
	1165	/* ya' = ya + yb + yc + yd */
	1166	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
	1167	pSi0 += 2;
	1168
	1169	/* R: real = (xa + xc) - (xb + xd), imag = (ya + yc) - (yb + yd) */
	1170	R = __QSUB16(R, T);
	1171
	1172	/* co2 & si2 are read from SIMD Coefficient pointer */
	1173	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
	1174
	1175	#ifndef ARM_MATH_BIG_ENDIAN
	1176
	1177	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1178	out1 = __SMUSD(C2, R) >> 16u;
	1179	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1180	out2 = __SMUADX(C2, R);
	1181
	1182	#else
	1183
	1184	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1185	out1 = __SMUADX(C2, R) >> 16u;
	1186	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2), obtained by negating C2 */
	1187	out2 = __SMUSD(__QSUB16(0, C2), R);
	1188
	1189	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1190
	1191	/* Reading i0+fftLen/4 again, before it is overwritten below */
	1192	/* T: real = xb, imag = yb (scaled by 1/4) */
	1193	T = _SIMD32_OFFSET(pSi1);
	1194	T = __SHADD16(T, 0);
	1195	T = __SHADD16(T, 0);
	1196
	1197	/* writing the butterfly processed i0 + fftLen/4 sample */
	1198	/* writing output(xc', yc'): out2 supplies the upper halfword, out1 the lower halfword */
	1199	_SIMD32_OFFSET(pSi1) =
	1200	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	1201	pSi1 += 2;
	1202
	1203	/* Butterfly calculations */
	1204	/* U: real = xd, imag = yd (scaled by 1/4) */
	1205	U = _SIMD32_OFFSET(pSi3);
	1206	U = __SHADD16(U, 0);
	1207	U = __SHADD16(U, 0);
	1208
	1209	/* T: real = (xb - xd), imag = (yb - yd) */
	1210	T = __QSUB16(T, U);
	1211
	1212	#ifndef ARM_MATH_BIG_ENDIAN
	1213
	1214	/* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb-xd) */
	1215	R = __QSAX(S, T);
	1216	/* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb-xd) */
	1217	S = __QASX(S, T);
	1218
	1219	#else
	1220
	1221	/* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb-xd) */
	1222	R = __QASX(S, T);
	1223	/* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb-xd) */
	1224	S = __QSAX(S, T);
	1225
	1226	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1227
	1228	/* co1 & si1 are read from SIMD Coefficient pointer */
	1229	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
	1230	/* Butterfly process for the i0+fftLen/2 sample */
	1231
	1232	#ifndef ARM_MATH_BIG_ENDIAN
	1233
	1234	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
	1235	out1 = __SMUSD(C1, S) >> 16u;
	1236	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
	1237	out2 = __SMUADX(C1, S);
	1238
	1239	#else
	1240
	1241	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
	1242	out1 = __SMUADX(C1, S) >> 16u;
	1243	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
	1244	out2 = __SMUSD(__QSUB16(0, C1), S);
	1245
	1246	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1247
	1248	/* writing output(xb', yb') to the i0+fftLen/2 sample */
	1249	_SIMD32_OFFSET(pSi2) =
	1250	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
	1251	pSi2 += 2;
	1252
	1253
	1254	/* co3 & si3 are read from SIMD Coefficient pointer */
	1255	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
	1256	/* Butterfly process for the i0+3fftLen/4 sample */
	1257
	1258	#ifndef ARM_MATH_BIG_ENDIAN
	1259
	1260	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
	1261	out1 = __SMUSD(C3, R) >> 16u;
	1262	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
	1263	out2 = __SMUADX(C3, R);
	1264
	1265	#else
	1266
	1267	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
	1268	out1 = __SMUADX(C3, R) >> 16u;
	1269	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
	1270	out2 = __SMUSD(__QSUB16(0, C3), R);
	1271
	1272	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1273
	1274	/* writing output(xd', yd') to the i0+3fftLen/4 sample */
	1275	_SIMD32_OFFSET(pSi3) =
	1276	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	1277	pSi3 += 2;
	1278
	1279	/* Twiddle coefficients index modifier */
	1280	ic = ic + twidCoefModifier;
	1281
	1282	} while(--j);
	1283	/* data is in 4.11(q11) format */
	1284
	1285	/* end of first stage process */
	1286
	1287
	1288	/* start of middle stage process */
	1289
	1290	/* Twiddle coefficients index modifier (the twiddle stride grows by 4 for every stage) */
	1291	twidCoefModifier <<= 2u;
	1292
	1293	/* Calculation of Middle stage */
	1294	for (k = fftLen / 4u; k > 4u; k >>= 2u)
	1295	{
	1296	/* Initializations for the middle stage */
	1297	n1 = n2;
	1298	n2 >>= 2u;
	1299	ic = 0u;
	1300
	1301	for (j = 0u; j <= (n2 - 1u); j++)
	1302	{
	1303	/* index calculation for the coefficients */
	1304	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
	1305	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
	1306	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
	1307
	1308	/* Twiddle coefficients index modifier */
	1309	ic = ic + twidCoefModifier;
	1310
	1311	pSi0 = pSrc16 + 2 * j;
	1312	pSi1 = pSi0 + 2 * n2;
	1313	pSi2 = pSi1 + 2 * n2;
	1314	pSi3 = pSi2 + 2 * n2;
	1315
	1316	/* Butterfly implementation: the same twiddles are reused for every group of n1 samples */
	1317	for (i0 = j; i0 < fftLen; i0 += n1)
	1318	{
	1319	/* Reading i0, i0 + 2*n2 inputs */
	1320	/* Read xa (real), ya (imag) input */
	1321	T = _SIMD32_OFFSET(pSi0);
	1322
	1323	/* Read xc (real), yc (imag) input */
	1324	S = _SIMD32_OFFSET(pSi2);
	1325
	1326	/* R: real = (xa + xc), imag = (ya + yc) */
	1327	R = __QADD16(T, S);
	1328
	1329	/* S: real = (xa - xc), imag = (ya - yc) */
	1330	S = __QSUB16(T, S);
	1331
	1332	/* Reading i0 + n2 , i0 + 3*n2 inputs */
	1333	/* Read xb (real), yb (imag) input */
	1334	T = _SIMD32_OFFSET(pSi1);
	1335
	1336	/* Read xd (real), yd (imag) input */
	1337	U = _SIMD32_OFFSET(pSi3);
	1338
	1339	/* T: real = (xb + xd), imag = (yb + yd) */
	1340	T = __QADD16(T, U);
	1341
	1342	/* writing the butterfly processed i0 sample */
	1343	/* the sum is halved twice by the two __SHADD16 steps, i.e. scaled by 1/4 */
	1344	/* xa' = xa + xb + xc + xd */
	1345	/* ya' = ya + yb + yc + yd */
	1346	out1 = __SHADD16(R, T);
	1347	out1 = __SHADD16(out1, 0);
	1348	_SIMD32_OFFSET(pSi0) = out1;
	1349	pSi0 += 2 * n1;
	1350
	1351	/* R: real = (xa + xc) - (xb + xd), imag = (ya + yc) - (yb + yd), halved */
	1352	R = __SHSUB16(R, T);
	1353
	1354	#ifndef ARM_MATH_BIG_ENDIAN
	1355
	1356	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1357	out1 = __SMUSD(C2, R) >> 16u;
	1358
	1359	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1360	out2 = __SMUADX(C2, R);
	1361
	1362	#else
	1363
	1364	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1365	out1 = __SMUADX(R, C2) >> 16u;
	1366
	1367	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1368	out2 = __SMUSD(__QSUB16(0, C2), R);
	1369
	1370	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1371
	1372	/* Reading i0 + n2 again, before it is overwritten below */
	1373	/* Read xb (real), yb (imag) input */
	1374	T = _SIMD32_OFFSET(pSi1);
	1375
	1376	/* writing the butterfly processed i0 + n2 sample */
	1377	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1378	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1379	_SIMD32_OFFSET(pSi1) =
	1380	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	1381	pSi1 += 2 * n1;
	1382
	1383	/* Butterfly calculations */
	1384
	1385	/* Read xd (real), yd (imag) input */
	1386	U = _SIMD32_OFFSET(pSi3);
	1387
	1388	/* T: real = (xb - xd), imag = (yb - yd) */
	1389	T = __QSUB16(T, U);
	1390
	1391	#ifndef ARM_MATH_BIG_ENDIAN
	1392
	1393	/* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb-xd), halved */
	1394	R = __SHSAX(S, T);
	1395
	1396	/* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb-xd), halved */
	1397	S = __SHASX(S, T);
	1398
	1399
	1400	/* Butterfly process for the i0 + 2*n2 sample */
	1401	out1 = __SMUSD(C1, S) >> 16u;
	1402	out2 = __SMUADX(C1, S);
	1403
	1404	#else
	1405
	1406	/* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb-xd), halved */
	1407	R = __SHASX(S, T);
	1408
	1409	/* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb-xd), halved */
	1410	S = __SHSAX(S, T);
	1411
	1412
	1413	/* Butterfly process for the i0 + 2*n2 sample */
	1414	out1 = __SMUADX(S, C1) >> 16u;
	1415	out2 = __SMUSD(__QSUB16(0, C1), S);
	1416
	1417	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1418
	1419	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
	1420	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
	1421	_SIMD32_OFFSET(pSi2) =
	1422	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	1423	pSi2 += 2 * n1;
	1424
	1425	/* Butterfly process for the i0 + 3*n2 sample */
	1426
	1427	#ifndef ARM_MATH_BIG_ENDIAN
	1428
	1429	out1 = __SMUSD(C3, R) >> 16u;
	1430	out2 = __SMUADX(C3, R);
	1431
	1432	#else
	1433
	1434	out1 = __SMUADX(C3, R) >> 16u;
	1435	out2 = __SMUSD(__QSUB16(0, C3), R);
	1436
	1437	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1438
	1439	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
	1440	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
	1441	_SIMD32_OFFSET(pSi3) =
	1442	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
	1443	pSi3 += 2 * n1;
	1444	}
	1445	}
	1446	/* Twiddle coefficients index modifier */
	1447	twidCoefModifier <<= 2u;
	1448	}
	1449	/* end of middle stage process */
	1450
	1451	/* data is in 10.6(q6) format for the 1024 point */
	1452	/* data is in 8.8(q8) format for the 256 point */
	1453	/* data is in 6.10(q10) format for the 64 point */
	1454	/* data is in 4.12(q12) format for the 16 point */
	1455
	1456	/* Initializations for the last stage: one butterfly per four consecutive complex samples */
	1457	j = fftLen >> 2;
	1458
	1459	ptr1 = &pSrc16[0];
	1460
	1461	/* start of last stage process (no twiddle coefficients are read in this stage) */
	1462
	1463	/* Butterfly implementation */
	1464	do
	1465	{
	1466	/* Read xa (real), ya(imag) input */
	1467	xaya = *__SIMD32(ptr1)++;
	1468
	1469	/* Read xb (real), yb(imag) input */
	1470	xbyb = *__SIMD32(ptr1)++;
	1471
	1472	/* Read xc (real), yc(imag) input */
	1473	xcyc = *__SIMD32(ptr1)++;
	1474
	1475	/* Read xd (real), yd(imag) input */
	1476	xdyd = *__SIMD32(ptr1)++;
	1477
	1478	/* R: real = (xa + xc), imag = (ya + yc) */
	1479	R = __QADD16(xaya, xcyc);
	1480
	1481	/* T: real = (xb + xd), imag = (yb + yd) */
	1482	T = __QADD16(xbyb, xdyd);
	1483
	1484	/* step ptr1 back by 8 q15 elements so the four results overwrite the four samples just read */
	1485	ptr1 = ptr1 - 8u;
	1486
	1487
	1488	/* xa' = xa + xb + xc + xd */
	1489	/* ya' = ya + yb + yc + yd */
	1490	*__SIMD32(ptr1)++ = __SHADD16(R, T);
	1491
	1492	/* T: real = (xb + xd), imag = (yb + yd) - recomputed from the same operands as above */
	1493	T = __QADD16(xbyb, xdyd);
	1494
	1495	/* xc' = (xa-xb+xc-xd) */
	1496	/* yc' = (ya-yb+yc-yd) */
	1497	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
	1498
	1499	/* S: real = (xa - xc), imag = (ya - yc) */
	1500	S = __QSUB16(xaya, xcyc);
	1501
	1502	/* xb/yb and xd/yd are taken from the values read at the top of the loop */
	1503	/* U: real = (xb - xd), imag = (yb - yd) */
	1504	U = __QSUB16(xbyb, xdyd);
	1505
	1506	#ifndef ARM_MATH_BIG_ENDIAN
	1507
	1508	/* xb' = (xa-yb-xc+yd) */
	1509	/* yb' = (ya+xb-yc-xd) */
	1510	*__SIMD32(ptr1)++ = __SHASX(S, U);
	1511
	1512
	1513	/* xd' = (xa+yb-xc-yd) */
	1514	/* yd' = (ya-xb-yc+xd) */
	1515	*__SIMD32(ptr1)++ = __SHSAX(S, U);
	1516
	1517	#else
	1518
	1519	/* xb' = (xa-yb-xc+yd) */
	1520	/* yb' = (ya+xb-yc-xd) */
	1521	*__SIMD32(ptr1)++ = __SHSAX(S, U);
	1522
	1523
	1524	/* xd' = (xa+yb-xc-yd) */
	1525	/* yd' = (ya-xb-yc+xd) */
	1526	*__SIMD32(ptr1)++ = __SHASX(S, U);
	1527
	1528
	1529	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
	1530
	1531	} while(--j);
	1532
	1533	/* end of last stage process */
	1534
	1535	/* output is in 11.5(q5) format for the 1024 point */
	1536	/* output is in 9.7(q7) format for the 256 point */
	1537	/* output is in 7.9(q9) format for the 64 point */
	1538	/* output is in 5.11(q11) format for the 16 point */
	1539
	1540
	1541	#else
	1542
	1543	/* Run the below code for Cortex-M0 (scalar q15 arithmetic on separate real and imaginary values) */
	1544
	1545	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
	1546	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
	1547	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
	1548
	1549	/* Total process is divided into three stages */
	1550
	1551	/* process first stage, middle stages, & last stage */
	1552
	1553	/* Initializations for the first stage */
	1554	n2 = fftLen;
	1555	n1 = n2;
	1556
	1557	/* n2 = fftLen/4 */
	1558	n2 >>= 2u;
	1559
	1560	/* Index for twiddle coefficient */
	1561	ic = 0u;
	1562
	1563	/* Index for input read and output write */
	1564	i0 = 0u;
	1565
	1566	j = n2;
	1567
	1568	/* Input is in 1.15(q15) format */
	1569
	1570	/* Start of first stage process */
	1571	do
	1572	{
	1573	/* Butterfly implementation */
	1574
	1575	/* index calculation for the input as, */
	1576	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
	1577	i1 = i0 + n2;
	1578	i2 = i1 + n2;
	1579	i3 = i2 + n2;
	1580
	1581	/* Reading i0, i0+fftLen/2 inputs */
	1582	/* input is down scale by 4 to avoid overflow */
	1583	/* Read xa (real) into T0, ya (imag) into T1 */
	1584	T0 = pSrc16[i0 * 2u] >> 2u;
	1585	T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
	1586	/* input is down scale by 4 to avoid overflow */
	1587	/* Read xc (real) into S0, yc (imag) into S1 */
	1588	S0 = pSrc16[i2 * 2u] >> 2u;
	1589	S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
	1590
	1591	/* R0 = (xa + xc), R1 = (ya + yc) */
	1592	R0 = __SSAT(T0 + S0, 16u);
	1593	R1 = __SSAT(T1 + S1, 16u);
	1594	/* S0 = (xa - xc), S1 = (ya - yc) */
	1595	S0 = __SSAT(T0 - S0, 16u);
	1596	S1 = __SSAT(T1 - S1, 16u);
	1597
	1598	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
	1599	/* input is down scale by 4 to avoid overflow */
	1600	/* Read xb (real) into T0, yb (imag) into T1 */
	1601	T0 = pSrc16[i1 * 2u] >> 2u;
	1602	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
	1603	/* Read xd (real) into U0, yd (imag) into U1 */
	1604	/* input is down scale by 4 to avoid overflow */
	1605	U0 = pSrc16[i3 * 2u] >> 2u;
	1606	U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
	1607
	1608	/* T0 = (xb + xd), T1 = (yb + yd) */
	1609	T0 = __SSAT(T0 + U0, 16u);
	1610	T1 = __SSAT(T1 + U1, 16u);
	1611
	1612	/* writing the butterfly processed i0 sample (each partial sum is halved before adding) */
	1613	/* xa' = xa + xb + xc + xd */
	1614	/* ya' = ya + yb + yc + yd */
	1615	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
	1616	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
	1617
	1618	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
	1619	R0 = __SSAT(R0 - T0, 16u);
	1620	R1 = __SSAT(R1 - T1, 16u);
	1621	/* co2 & si2 are read from Coefficient pointer */
	1622	Co2 = pCoef16[2u * ic * 2u];
	1623	Si2 = pCoef16[(2u * ic * 2u) + 1u];
	1624	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1625	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
	1626	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1627	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
	1628
	1629	/* Reading i0+fftLen/4 again, before it is overwritten below */
	1630	/* input is down scale by 4 to avoid overflow */
	1631	/* T0 = xb, T1 = yb */
	1632	T0 = pSrc16[i1 * 2u] >> 2u;
	1633	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
	1634
	1635	/* writing the butterfly processed i0 + fftLen/4 sample */
	1636	/* writing output(xc', yc') */
	1637	pSrc16[i1 * 2u] = out1;
	1638	pSrc16[(i1 * 2u) + 1u] = out2;
	1639
	1640	/* Butterfly calculations */
	1641	/* input is down scale by 4 to avoid overflow */
	1642	/* U0 = xd, U1 = yd */
	1643	U0 = pSrc16[i3 * 2u] >> 2u;
	1644	U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
	1645
	1646	/* T0 = xb-xd, T1 = yb-yd */
	1647	T0 = __SSAT(T0 - U0, 16u);
	1648	T1 = __SSAT(T1 - U1, 16u);
	1649	/* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd) */
	1650	R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
	1651	R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
	1652	/* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
	1653	S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
	1654	S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
	1655
	1656	/* co1 & si1 are read from Coefficient pointer */
	1657	Co1 = pCoef16[ic * 2u];
	1658	Si1 = pCoef16[(ic * 2u) + 1u];
	1659	/* Butterfly process for the i0+fftLen/2 sample */
	1660	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
	1661	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
	1662	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
	1663	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
	1664	/* writing output(xb', yb') */
	1665	pSrc16[i2 * 2u] = out1;
	1666	pSrc16[(i2 * 2u) + 1u] = out2;
	1667
	1668	/* Co3 & si3 are read from Coefficient pointer */
	1669	Co3 = pCoef16[3u * ic * 2u];
	1670	Si3 = pCoef16[(3u * ic * 2u) + 1u];
	1671	/* Butterfly process for the i0+3fftLen/4 sample */
	1672	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
	1673	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
	1674	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
	1675	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
	1676	/* writing output(xd', yd') */
	1677	pSrc16[i3 * 2u] = out1;
	1678	pSrc16[(i3 * 2u) + 1u] = out2;
	1679
	1680	/* Twiddle coefficients index modifier */
	1681	ic = ic + twidCoefModifier;
	1682
	1683	/* Updating input index */
	1684	i0 = i0 + 1u;
	1685
	1686	} while(--j);
	1687
	1688	/* End of first stage process */
	1689
	1690	/* data is in 4.11(q11) format */
	1691
	1692
	1693	/* Start of Middle stage process */
	1694
	1695	/* Twiddle coefficients index modifier (the twiddle stride grows by 4 for every stage) */
	1696	twidCoefModifier <<= 2u;
	1697
	1698	/* Calculation of Middle stage */
	1699	for (k = fftLen / 4u; k > 4u; k >>= 2u)
	1700	{
	1701	/* Initializations for the middle stage */
	1702	n1 = n2;
	1703	n2 >>= 2u;
	1704	ic = 0u;
	1705
	1706	for (j = 0u; j <= (n2 - 1u); j++)
	1707	{
	1708	/* index calculation for the coefficients */
	1709	Co1 = pCoef16[ic * 2u];
	1710	Si1 = pCoef16[(ic * 2u) + 1u];
	1711	Co2 = pCoef16[2u * ic * 2u];
	1712	Si2 = pCoef16[2u * ic * 2u + 1u];
	1713	Co3 = pCoef16[3u * ic * 2u];
	1714	Si3 = pCoef16[(3u * ic * 2u) + 1u];
	1715
	1716	/* Twiddle coefficients index modifier */
	1717	ic = ic + twidCoefModifier;
	1718
	1719	/* Butterfly implementation: the same twiddles are reused for every group of n1 samples */
	1720	for (i0 = j; i0 < fftLen; i0 += n1)
	1721	{
	1722	/* index calculation for the input as, */
	1723	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
	1724	i1 = i0 + n2;
	1725	i2 = i1 + n2;
	1726	i3 = i2 + n2;
	1727
	1728	/* Reading i0, i0 + 2*n2 inputs */
	1729	/* Read xa (real) into T0, ya (imag) into T1 */
	1730	T0 = pSrc16[i0 * 2u];
	1731	T1 = pSrc16[(i0 * 2u) + 1u];
	1732
	1733	/* Read xc (real) into S0, yc (imag) into S1 */
	1734	S0 = pSrc16[i2 * 2u];
	1735	S1 = pSrc16[(i2 * 2u) + 1u];
	1736
	1737
	1738	/* R0 = (xa + xc), R1 = (ya + yc) */
	1739	R0 = __SSAT(T0 + S0, 16u);
	1740	R1 = __SSAT(T1 + S1, 16u);
	1741	/* S0 = (xa - xc), S1 = (ya - yc) */
	1742	S0 = __SSAT(T0 - S0, 16u);
	1743	S1 = __SSAT(T1 - S1, 16u);
	1744
	1745	/* Reading i0 + n2 , i0 + 3*n2 inputs */
	1746	/* Read xb (real) into T0, yb (imag) into T1 */
	1747	T0 = pSrc16[i1 * 2u];
	1748	T1 = pSrc16[(i1 * 2u) + 1u];
	1749
	1750	/* Read xd (real) into U0, yd (imag) into U1 */
	1751	U0 = pSrc16[i3 * 2u];
	1752	U1 = pSrc16[(i3 * 2u) + 1u];
	1753
	1754	/* T0 = (xb + xd), T1 = (yb + yd) */
	1755	T0 = __SSAT(T0 + U0, 16u);
	1756	T1 = __SSAT(T1 + U1, 16u);
	1757
	1758	/* writing the butterfly processed i0 sample (halved twice by the shifts, i.e. scaled by 1/4) */
	1759	/* xa' = xa + xb + xc + xd */
	1760	/* ya' = ya + yb + yc + yd */
	1761	pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
	1762	pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
	1763
	1764	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved */
	1765	R0 = (R0 >> 1u) - (T0 >> 1u);
	1766	R1 = (R1 >> 1u) - (T1 >> 1u);
	1767
	1768	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1769	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
	1770	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1771	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
	1772
	1773	/* Reading i0 + n2 again, before it is overwritten below */
	1774	/* Read xb (real) into T0, yb (imag) into T1 */
	1775	T0 = pSrc16[i1 * 2u];
	1776	T1 = pSrc16[(i1 * 2u) + 1u];
	1777
	1778	/* writing the butterfly processed i0 + n2 sample */
	1779	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
	1780	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
	1781	pSrc16[i1 * 2u] = out1;
	1782	pSrc16[(i1 * 2u) + 1u] = out2;
	1783
	1784	/* Butterfly calculations */
	1785	/* Read xd (real) into U0, yd (imag) into U1 */
	1786	U0 = pSrc16[i3 * 2u];
	1787	U1 = pSrc16[(i3 * 2u) + 1u];
	1788
	1789	/* T0 = xb-xd, T1 = yb-yd */
	1790	T0 = __SSAT(T0 - U0, 16u);
	1791	T1 = __SSAT(T1 - U1, 16u);
	1792
	1793	/* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd), each term halved */
	1794	R0 = (S0 >> 1u) + (T1 >> 1u);
	1795	R1 = (S1 >> 1u) - (T0 >> 1u);
	1796
	1797	/* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), each term halved */
	1798	S0 = (S0 >> 1u) - (T1 >> 1u);
	1799	S1 = (S1 >> 1u) + (T0 >> 1u);
	1800
	1801	/* Butterfly process for the i0 + 2*n2 sample */
	1802	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
	1803	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
	1804	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
	1805	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
	1806	pSrc16[i2 * 2u] = out1;
	1807	pSrc16[(i2 * 2u) + 1u] = out2;
	1808
	1809	/* Butterfly process for the i0 + 3*n2 sample */
	1810	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
	1811
	1812	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
	1813	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
	1814	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
	1815	pSrc16[i3 * 2u] = out1;
	1816	pSrc16[(i3 * 2u) + 1u] = out2;
	1817
	1818
	1819	}
	1820	}
	1821	/* Twiddle coefficients index modifier */
	1822	twidCoefModifier <<= 2u;
	1823	}
	1824	/* End of Middle stages process */
	1825
	1826
	1827	/* data is in 10.6(q6) format for the 1024 point */
	1828	/* data is in 8.8(q8) format for the 256 point */
	1829	/* data is in 6.10(q10) format for the 64 point */
	1830	/* data is in 4.12(q12) format for the 16 point */
	1831
	1832	/* start of last stage process (no twiddle coefficients are read in this stage) */
	1833
	1834
	1835	/* Initializations for the last stage */
	1836	n1 = n2;
	1837	n2 >>= 2u;
	1838
	1839	/* Butterfly implementation */
	1840	for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
	1841	{
	1842	/* index calculation for the input as, */
	1843	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
	1844	i1 = i0 + n2;
	1845	i2 = i1 + n2;
	1846	i3 = i2 + n2;
	1847
	1848	/* Reading i0, i0 + 2*n2 inputs */
	1849	/* Read xa (real) into T0, ya (imag) into T1 */
	1850	T0 = pSrc16[i0 * 2u];
	1851	T1 = pSrc16[(i0 * 2u) + 1u];
	1852	/* Read xc (real) into S0, yc (imag) into S1 */
	1853	S0 = pSrc16[i2 * 2u];
	1854	S1 = pSrc16[(i2 * 2u) + 1u];
	1855
	1856	/* R0 = (xa + xc), R1 = (ya + yc) */
	1857	R0 = __SSAT(T0 + S0, 16u);
	1858	R1 = __SSAT(T1 + S1, 16u);
	1859	/* S0 = (xa - xc), S1 = (ya - yc) */
	1860	S0 = __SSAT(T0 - S0, 16u);
	1861	S1 = __SSAT(T1 - S1, 16u);
	1862
	1863	/* Reading i0 + n2 , i0 + 3*n2 inputs */
	1864	/* Read xb (real) into T0, yb (imag) into T1 */
	1865	T0 = pSrc16[i1 * 2u];
	1866	T1 = pSrc16[(i1 * 2u) + 1u];
	1867	/* Read xd (real) into U0, yd (imag) into U1 */
	1868	U0 = pSrc16[i3 * 2u];
	1869	U1 = pSrc16[(i3 * 2u) + 1u];
	1870
	1871	/* T0 = (xb + xd), T1 = (yb + yd) */
	1872	T0 = __SSAT(T0 + U0, 16u);
	1873	T1 = __SSAT(T1 + U1, 16u);
	1874
	1875	/* writing the butterfly processed i0 sample (each partial sum is halved before adding) */
	1876	/* xa' = xa + xb + xc + xd */
	1877	/* ya' = ya + yb + yc + yd */
	1878	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
	1879	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
	1880
	1881	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved */
	1882	R0 = (R0 >> 1u) - (T0 >> 1u);
	1883	R1 = (R1 >> 1u) - (T1 >> 1u);
	1884
	1885	/* Read xb (real), yb (imag) again, before i1 is overwritten below */
	1886	T0 = pSrc16[i1 * 2u];
	1887	T1 = pSrc16[(i1 * 2u) + 1u];
	1888
	1889	/* writing the butterfly processed i0 + n2 sample */
	1890	/* xc' = (xa-xb+xc-xd) */
	1891	/* yc' = (ya-yb+yc-yd) */
	1892	pSrc16[i1 * 2u] = R0;
	1893	pSrc16[(i1 * 2u) + 1u] = R1;
	1894
	1895	/* Read xd (real) into U0, yd (imag) into U1 */
	1896	U0 = pSrc16[i3 * 2u];
	1897	U1 = pSrc16[(i3 * 2u) + 1u];
	1898	/* T0 = (xb - xd), T1 = (yb - yd) */
	1899	T0 = __SSAT(T0 - U0, 16u);
	1900	T1 = __SSAT(T1 - U1, 16u);
	1901
	1902	/* writing the butterfly processed i0 + 2*n2 sample */
	1903	/* xb' = (xa-yb-xc+yd) */
	1904	/* yb' = (ya+xb-yc-xd) */
	1905	pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
	1906	pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
	1907
	1908
	1909	/* writing the butterfly processed i0 + 3*n2 sample */
	1910	/* xd' = (xa+yb-xc-yd) */
	1911	/* yd' = (ya-xb-yc+xd) */
	1912	pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
	1913	pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
	1914	}
	1915	/* end of last stage process */
	1916
	1917	/* output is in 11.5(q5) format for the 1024 point */
	1918	/* output is in 9.7(q7) format for the 256 point */
	1919	/* output is in 7.9(q9) format for the 64 point */
	1920	/* output is in 5.11(q11) format for the 16 point */
	1921
	1922	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
	1923
	1924	}