diff --git a/src/math/fft/CMakeLists.txt b/src/math/fft/CMakeLists.txt index 968c9ad38ac8..c605886c335e 100644 --- a/src/math/fft/CMakeLists.txt +++ b/src/math/fft/CMakeLists.txt @@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT) endif() if(CONFIG_MATH_FFT_MULTI) - list(APPEND base_files fft_multi.c) + list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c) endif() is_zephyr(zephyr) diff --git a/src/math/fft/fft_32_hifi3.c b/src/math/fft/fft_32_hifi3.c index 1ca88c48268d..537c576468fc 100644 --- a/src/math/fft/fft_32_hifi3.c +++ b/src/math/fft/fft_32_hifi3.c @@ -19,13 +19,16 @@ void fft_execute_32(struct fft_plan *plan, bool ifft) ae_int32x2 sample; ae_int32x2 sample1; ae_int32x2 sample2; + ae_int32x2 tw; ae_int32x2 *inx = (ae_int32x2 *)plan->inb32; ae_int32x2 *outx = (ae_int32x2 *)plan->outb32; - ae_int32x2 *outtop; - ae_int32x2 *outbottom; + ae_int32x2 *top_ptr; + ae_int32x2 *bot_ptr; uint16_t *idx = &plan->bit_reverse_idx[0]; - int depth, top, bottom, index; - int i, j, k, m, n; + const int32_t *tw_r; + const int32_t *tw_i; + int depth, i; + int j, k, m, n; int size = plan->size; int len = plan->len; @@ -35,7 +38,6 @@ void fft_execute_32(struct fft_plan *plan, bool ifft) if (!plan->inb32 || !plan->outb32) return; - /* convert to complex conjugate for ifft */ /* step 1: re-arrange input in bit reverse order, and shrink the level to avoid overflow */ if (ifft) { /* convert to complex conjugate for ifft */ @@ -54,43 +56,82 @@ void fft_execute_32(struct fft_plan *plan, bool ifft) } } - /* step 2: loop to do FFT transform in smaller size */ - for (depth = 1; depth <= len; ++depth) { + /* + * Step 2a: First FFT stage (depth=1, m=2, n=1). + * All butterflies use twiddle factor W^0 = 1+0j, + * so the complex multiply is skipped entirely. + */ + top_ptr = outx; + bot_ptr = outx + 1; + for (k = 0; k < size; k += 2) { + sample1 = AE_L32X2_I(top_ptr, 0); + sample2 = AE_L32X2_I(bot_ptr, 0); + sample = AE_ADD32S(sample1, sample2); + AE_S32X2_I(sample, top_ptr, 0); + sample = AE_SUB32S(sample1, sample2); + AE_S32X2_I(sample, bot_ptr, 0); + top_ptr += 2; + bot_ptr += 2; + } + + /* Step 2b: Remaining FFT stages (depth >= 2) */ + for (depth = 2; depth <= len; ++depth) { m = 1 << depth; n = m >> 1; i = FFT_SIZE_MAX >> depth; + top_ptr = outx; + bot_ptr = outx + n; + /* doing FFT transforms in size m */ for (k = 0; k < size; k += m) { - /* doing one FFT transform for size m */ - for (j = 0; j < n; ++j) { - index = i * j; - top = k + j; - bottom = top + n; + /* + * j=0: twiddle factor W^0 = 1+0j, + * butterfly without complex multiply. + */ + sample1 = AE_L32X2_I(top_ptr, 0); + sample = AE_L32X2_I(bot_ptr, 0); + sample2 = AE_ADD32S(sample1, sample); + AE_S32X2_I(sample2, top_ptr, 0); + sample2 = AE_SUB32S(sample1, sample); + AE_S32X2_I(sample2, bot_ptr, 0); + top_ptr++; + bot_ptr++; - /* load twiddle factor to sample1 */ - sample1 = twiddle_real_32[index]; - sample2 = twiddle_imag_32[index]; - sample1 = AE_SEL32_LH(sample1, sample2); + /* j=1..n-1: full butterfly with twiddle multiply */ + tw_r = &twiddle_real_32[i]; + tw_i = &twiddle_imag_32[i]; + for (j = 1; j < n; ++j) { + /* load and combine twiddle factor {real, imag} */ + sample1 = tw_r[0]; + sample2 = tw_i[0]; + tw = AE_SEL32_LH(sample1, sample2); /* calculate the accumulator: twiddle * bottom */ - sample2 = outx[bottom]; - res = AE_MULF32S_HH(sample1, sample2); - AE_MULSF32S_LL(res, sample1, sample2); - res1 = AE_MULF32S_HL(sample1, sample2); - AE_MULAF32S_LH(res1, sample1, sample2); + sample2 = AE_L32X2_I(bot_ptr, 0); + res = AE_MULF32S_HH(tw, sample2); + AE_MULSF32S_LL(res, tw, sample2); + res1 = AE_MULF32S_HL(tw, sample2); + AE_MULAF32S_LH(res1, tw, sample2); sample = AE_ROUND32X2F64SSYM(res, res1); - sample1 = outx[top]; + sample1 = AE_L32X2_I(top_ptr, 0); + /* calculate the top output: top = top + accumulate */ sample2 = AE_ADD32S(sample1, sample); - outtop = outx + top; - AE_S32X2_I(sample2, outtop, 0); + AE_S32X2_I(sample2, top_ptr, 0); /* calculate the bottom output: bottom = top - accumulate */ sample2 = AE_SUB32S(sample1, sample); - outbottom = outx + bottom; - AE_S32X2_I(sample2, outbottom, 0); + AE_S32X2_I(sample2, bot_ptr, 0); + + top_ptr++; + bot_ptr++; + tw_r += i; + tw_i += i; } + /* advance pointers past current group's bottom half */ + top_ptr += n; + bot_ptr += n; } } diff --git a/src/math/fft/fft_multi.c b/src/math/fft/fft_multi.c index 2b01f1632478..b16decca012d 100644 --- a/src/math/fft/fft_multi.c +++ b/src/math/fft/fft_multi.c @@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL); SOF_DEFINE_REG_UUID(math_fft_multi); DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO); -/* Constants for size 3 DFT */ -#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */ -#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */ -#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */ - struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb, void *outb, uint32_t size, int bits) { @@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla mod_free(mod, plan->bit_reverse_idx); mod_free(mod, plan); } - -void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y) -{ - const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI}; - const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI}; - struct icomplex32 x[3]; - struct icomplex32 p1, p2, sum; - int i; - - for (i = 0; i < 3; i++) { - x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31); - x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31); - } - - /* - * | 1 1 1 | - * c = | 1 c0 c1 | , x = [ x0 x1 x2 ] - * | 1 c1 c0 | - * - * y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0) - * y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0) - * y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0) - */ - - /* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */ - icomplex32_adds(&x[0], &x[1], &sum); - icomplex32_adds(&x[2], &sum, &y[0]); - - /* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */ - icomplex32_mul(&c0, &x[1], &p1); - icomplex32_mul(&c1, &x[2], &p2); - icomplex32_adds(&p1, &p2, &sum); - icomplex32_adds(&x[0], &sum, &y[1]); - - /* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */ - icomplex32_mul(&c1, &x[1], &p1); - icomplex32_mul(&c0, &x[2], &p2); - icomplex32_adds(&p1, &p2, &sum); - icomplex32_adds(&x[0], &sum, &y[2]); -} - -void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft) -{ - struct icomplex32 x[FFT_MULTI_COUNT_MAX]; - struct icomplex32 y[FFT_MULTI_COUNT_MAX]; - struct icomplex32 t, c; - int i, j, k, m; - - /* Handle 2^N FFT */ - if (plan->num_ffts == 1) { - memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32)); - fft_execute_32(plan->fft_plan[0], ifft); - return; - } - -#ifdef DEBUG_DUMP_TO_FILE - FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w"); - FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w"); - FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w"); - FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w"); -#endif - - /* convert to complex conjugate for IFFT */ - if (ifft) { - for (i = 0; i < plan->total_size; i++) - icomplex32_conj(&plan->inb32[i]); - } - - /* Copy input buffers */ - k = 0; - for (i = 0; i < plan->fft_size; i++) - for (j = 0; j < plan->num_ffts; j++) - plan->tmp_i32[j][i] = plan->inb32[k++]; - - /* Clear output buffers and call individual FFTs*/ - for (j = 0; j < plan->num_ffts; j++) { - memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32)); - fft_execute_32(plan->fft_plan[j], 0); - } - -#ifdef DEBUG_DUMP_TO_FILE - for (j = 0; j < plan->num_ffts; j++) - for (i = 0; i < plan->fft_size; i++) - fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag); -#endif - - /* Multiply with twiddle factors */ - m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size; - for (j = 1; j < plan->num_ffts; j++) { - for (i = 0; i < plan->fft_size; i++) { - c = plan->tmp_o32[j][i]; - k = j * i * m; - t.real = multi_twiddle_real_32[k]; - t.imag = multi_twiddle_imag_32[k]; - //fprintf(fh3, "%d %d\n", t.real, t.imag); - icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]); - } - } - -#ifdef DEBUG_DUMP_TO_FILE - for (j = 0; j < plan->num_ffts; j++) - for (i = 0; i < plan->fft_size; i++) - fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag); -#endif - - /* DFT of size 3 */ - j = plan->fft_size; - k = 2 * plan->fft_size; - for (i = 0; i < plan->fft_size; i++) { - x[0] = plan->tmp_o32[0][i]; - x[1] = plan->tmp_o32[1][i]; - x[2] = plan->tmp_o32[2][i]; - dft3_32(x, y); - plan->outb32[i] = y[0]; - plan->outb32[i + j] = y[1]; - plan->outb32[i + k] = y[2]; - } - -#ifdef DEBUG_DUMP_TO_FILE - for (i = 0; i < plan->total_size; i++) - fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag); -#endif - - /* shift back for IFFT */ - - /* TODO: Check if time shift method for IFFT is more efficient or more accurate - * tmp = 1 / N * fft(X); - * x = tmp([1 N:-1:2]) - */ - if (ifft) { - /* - * no need to divide N as it is already done in the input side - * for Q1.31 format. Instead, we need to multiply N to compensate - * the shrink we did in the FFT transform. - */ - for (i = 0; i < plan->total_size; i++) { - /* Need to negate imag part to match reference */ - plan->outb32[i].imag = -plan->outb32[i].imag; - icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len, - &plan->outb32[i]); - plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3); - plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3); - } - } - -#ifdef DEBUG_DUMP_TO_FILE - fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4); -#endif -} diff --git a/src/math/fft/fft_multi_generic.c b/src/math/fft/fft_multi_generic.c new file mode 100644 index 000000000000..dd30368bbdbb --- /dev/null +++ b/src/math/fft/fft_multi_generic.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2025-2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +#include +#include +#include +#include + +/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */ +#define FFT_MULTI_TWIDDLE_SIZE 2048 +extern const int32_t multi_twiddle_real_32[]; +extern const int32_t multi_twiddle_imag_32[]; + +/* Constants for size 3 DFT */ +#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */ +#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */ +#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */ + +#ifdef FFT_GENERIC + +void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y) +{ + const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI}; + const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI}; + struct icomplex32 x[3]; + struct icomplex32 p1, p2, sum; + int i; + + for (i = 0; i < 3; i++) { + x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31); + x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31); + } + + /* + * | 1 1 1 | + * c = | 1 c0 c1 | , x = [ x0 x1 x2 ] + * | 1 c1 c0 | + * + * y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(2) + * y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(2) + * y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(2) + */ + + /* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */ + icomplex32_adds(&x[0], &x[1], &sum); + icomplex32_adds(&x[2], &sum, &y[0]); + + /* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */ + icomplex32_mul(&c0, &x[1], &p1); + icomplex32_mul(&c1, &x[2], &p2); + icomplex32_adds(&p1, &p2, &sum); + icomplex32_adds(&x[0], &sum, &y[1]); + + /* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */ + icomplex32_mul(&c1, &x[1], &p1); + icomplex32_mul(&c0, &x[2], &p2); + icomplex32_adds(&p1, &p2, &sum); + icomplex32_adds(&x[0], &sum, &y[2]); +} + +void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft) +{ + struct icomplex32 x[FFT_MULTI_COUNT_MAX]; + struct icomplex32 y[FFT_MULTI_COUNT_MAX]; + struct icomplex32 t, c; + int i, j, k, m; + + /* Handle 2^N FFT */ + if (plan->num_ffts == 1) { + memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32)); + fft_execute_32(plan->fft_plan[0], ifft); + return; + } + +#ifdef DEBUG_DUMP_TO_FILE + FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w"); + FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w"); + FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w"); + FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w"); +#endif + + /* convert to complex conjugate for IFFT */ + if (ifft) { + for (i = 0; i < plan->total_size; i++) + icomplex32_conj(&plan->inb32[i]); + } + + /* Copy input buffers */ + k = 0; + for (i = 0; i < plan->fft_size; i++) + for (j = 0; j < plan->num_ffts; j++) + plan->tmp_i32[j][i] = plan->inb32[k++]; + + /* Clear output buffers and call individual FFTs*/ + for (j = 0; j < plan->num_ffts; j++) { + memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32)); + fft_execute_32(plan->fft_plan[j], 0); + } + +#ifdef DEBUG_DUMP_TO_FILE + for (j = 0; j < plan->num_ffts; j++) + for (i = 0; i < plan->fft_size; i++) + fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag); +#endif + + /* Multiply with twiddle factors */ + m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size; + for (j = 1; j < plan->num_ffts; j++) { + for (i = 0; i < plan->fft_size; i++) { + c = plan->tmp_o32[j][i]; + k = j * i * m; + t.real = multi_twiddle_real_32[k]; + t.imag = multi_twiddle_imag_32[k]; + // fprintf(fh3, "%d %d\n", t.real, t.imag); + icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]); + } + } + +#ifdef DEBUG_DUMP_TO_FILE + for (j = 0; j < plan->num_ffts; j++) + for (i = 0; i < plan->fft_size; i++) + fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag); +#endif + + /* DFT of size 3 */ + j = plan->fft_size; + k = 2 * plan->fft_size; + for (i = 0; i < plan->fft_size; i++) { + x[0] = plan->tmp_o32[0][i]; + x[1] = plan->tmp_o32[1][i]; + x[2] = plan->tmp_o32[2][i]; + dft3_32(x, y); + plan->outb32[i] = y[0]; + plan->outb32[i + j] = y[1]; + plan->outb32[i + k] = y[2]; + } + +#ifdef DEBUG_DUMP_TO_FILE + for (i = 0; i < plan->total_size; i++) + fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag); +#endif + + /* shift back for IFFT */ + + /* TODO: Check if time shift method for IFFT is more efficient or more accurate + * tmp = 1 / N * fft(X); + * x = tmp([1 N:-1:2]) + */ + if (ifft) { + /* + * no need to divide N as it is already done in the input side + * for Q1.31 format. Instead, we need to multiply N to compensate + * the shrink we did in the FFT transform. + */ + for (i = 0; i < plan->total_size; i++) { + /* Need to negate imag part to match reference */ + plan->outb32[i].imag = -plan->outb32[i].imag; + icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len, + &plan->outb32[i]); + plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3); + plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3); + } + } + +#ifdef DEBUG_DUMP_TO_FILE + fclose(fh1); + fclose(fh2); + fclose(fh3); + fclose(fh4); +#endif +} + +#endif /* FFT_GENERIC */ diff --git a/src/math/fft/fft_multi_hifi3.c b/src/math/fft/fft_multi_hifi3.c new file mode 100644 index 000000000000..bd1966168369 --- /dev/null +++ b/src/math/fft/fft_multi_hifi3.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2025-2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * @file fft_multi_hifi3.c + * @brief HiFi3 optimized multi-radix FFT functions. + * + * This file provides HiFi3 optimized implementations of dft3_32() and + * fft_multi_execute_32() using Xtensa HiFi3 intrinsics. These replace + * the generic versions when HiFi3 or HiFi4 hardware is available. + * + * Key optimizations over the generic versions: + * - Packed {real, imag} processing using ae_int32x2 registers + * - 64-bit MAC accumulation for fused complex multiply-add + * - Saturating 32-bit arithmetic via AE_ADD32S/AE_SLAA32S + * - Vectorized conjugate, shift, and scale in the IFFT path + */ + +#include +#include +#include +#include +#include + +/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */ +#define FFT_MULTI_TWIDDLE_SIZE 2048 +extern const int32_t multi_twiddle_real_32[]; +extern const int32_t multi_twiddle_imag_32[]; + +#ifdef FFT_HIFI3 + +#include + +/** @brief Q1.31 constant -0.5 */ +#define DFT3_COEFR -1073741824 +/** @brief Q1.31 constant sqrt(3)/2 */ +#define DFT3_COEFI 1859775393 +/** @brief Q1.31 constant 1/3 */ +#define DFT3_SCALE 715827883 + +/** + * dft3_32() - Compute 3-point DFT of Q1.31 complex data (HiFi3). + * @param x_in Pointer to 3 input complex samples in Q1.31. + * @param y Pointer to 3 output complex samples in Q1.31. + * + * Computes the DFT matrix-vector product: + * + * | 1 1 1 | + * Y = | 1 c0 c1 | * X / 3 + * | 1 c1 c0 | + * + * where c0 = exp(-j*2*pi/3) and c1 = exp(+j*2*pi/3). + * Input is prescaled by 1/3 to prevent overflow. + * + * The two complex multiplies for each output bin are fused + * into a single 64-bit accumulator pair, avoiding intermediate + * rounding and saving instructions. + */ +void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y) +{ + ae_int32x2 *p_in = (ae_int32x2 *)x_in; + ae_int32x2 *p_out = (ae_int32x2 *)y; + ae_int32x2 x0, x1, x2; + ae_int32x2 c0, c1; + ae_int32x2 scale; + ae_int32x2 sum, result; + ae_int64 re, im; + + /* + * Set up DFT3 twiddle factors as packed {H=real, L=imag}. + * c0 = {-0.5, -sqrt(3)/2} + * c1 = {-0.5, +sqrt(3)/2} + */ + scale = AE_MOVDA32(DFT3_COEFR); + c0 = AE_SEL32_LH(scale, AE_MOVDA32(-DFT3_COEFI)); + c1 = AE_SEL32_LH(scale, AE_MOVDA32(DFT3_COEFI)); + + /* Scale factor 1/3, broadcast to both H and L */ + scale = AE_MOVDA32(DFT3_SCALE); + + /* Load input samples as packed {real, imag} */ + x0 = AE_L32X2_I(p_in, 0); + x1 = AE_L32X2_I(p_in, 1 * sizeof(ae_int32x2)); + x2 = AE_L32X2_I(p_in, 2 * sizeof(ae_int32x2)); + + /* Scale all inputs by 1/3 to prevent overflow */ + x0 = AE_MULFP32X2RS(x0, scale); + x1 = AE_MULFP32X2RS(x1, scale); + x2 = AE_MULFP32X2RS(x2, scale); + + /* y[0] = x[0] + x[1] + x[2] */ + sum = AE_ADD32S(x0, x1); + AE_S32X2_I(AE_ADD32S(sum, x2), p_out, 0); + + /* + * y[1] = x[0] + c0 * x[1] + c1 * x[2] + * + * Fuse two complex multiplies and their sum into a single + * 64-bit accumulator pair to avoid intermediate rounding. + * + * Real part: c0.re*x1.re - c0.im*x1.im + c1.re*x2.re - c1.im*x2.im + * Imag part: c0.re*x1.im + c0.im*x1.re + c1.re*x2.im + c1.im*x2.re + */ + re = AE_MULF32S_HH(c0, x1); /* c0.re * x1.re */ + AE_MULSF32S_LL(re, c0, x1); /* -= c0.im * x1.im */ + AE_MULAF32S_HH(re, c1, x2); /* += c1.re * x2.re */ + AE_MULSF32S_LL(re, c1, x2); /* -= c1.im * x2.im */ + + im = AE_MULF32S_HL(c0, x1); /* c0.re * x1.im */ + AE_MULAF32S_LH(im, c0, x1); /* += c0.im * x1.re */ + AE_MULAF32S_HL(im, c1, x2); /* += c1.re * x2.im */ + AE_MULAF32S_LH(im, c1, x2); /* += c1.im * x2.re */ + + result = AE_ROUND32X2F64SSYM(re, im); + AE_S32X2_I(AE_ADD32S(x0, result), p_out, sizeof(ae_int32x2)); + + /* + * y[2] = x[0] + c1 * x[1] + c0 * x[2] + * + * Same structure as y[1] but with swapped coefficients. + */ + re = AE_MULF32S_HH(c1, x1); /* c1.re * x1.re */ + AE_MULSF32S_LL(re, c1, x1); /* -= c1.im * x1.im */ + AE_MULAF32S_HH(re, c0, x2); /* += c0.re * x2.re */ + AE_MULSF32S_LL(re, c0, x2); /* -= c0.im * x2.im */ + + im = AE_MULF32S_HL(c1, x1); /* c1.re * x1.im */ + AE_MULAF32S_LH(im, c1, x1); /* += c1.im * x1.re */ + AE_MULAF32S_HL(im, c0, x2); /* += c0.re * x2.im */ + AE_MULAF32S_LH(im, c0, x2); /* += c0.im * x2.re */ + + result = AE_ROUND32X2F64SSYM(re, im); + AE_S32X2_I(AE_ADD32S(x0, result), p_out, 2 * sizeof(ae_int32x2)); +} + +/** + * fft_multi_execute_32() - Execute multi-radix FFT/IFFT (HiFi3). + * @param plan Pointer to multi-FFT plan. + * @param ifft False for FFT, true for IFFT. + * + * Performs a composite FFT of size N = num_ffts * fft_size (e.g. 3 * 512 = 1536). + * For power-of-two sizes the call is forwarded to fft_execute_32(). + * + * HiFi3 optimizations vs. the generic path: + * - IFFT conjugate uses packed AE_SEL32_HL / AE_NEG32S + * - Twiddle multiply uses fused 64-bit MAC (no intermediate rounding) + * - IFFT shift-back combines negate, shift, and *3 scale in HiFi3 ops + */ +void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft) +{ + struct icomplex32 x[FFT_MULTI_COUNT_MAX]; + struct icomplex32 y[FFT_MULTI_COUNT_MAX]; + ae_int32x2 *p_src; + ae_int32x2 *p_dst; + ae_int32x2 sample, sample_neg, tw, data; + ae_int64 re, im; + int i, j, k, m; + + /* Handle 2^N FFT */ + if (plan->num_ffts == 1) { + memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32)); + fft_execute_32(plan->fft_plan[0], ifft); + return; + } + + /* Convert to complex conjugate for IFFT */ + if (ifft) { + p_src = (ae_int32x2 *)plan->inb32; + for (i = 0; i < plan->total_size; i++) { + AE_L32X2_IP(sample, p_src, 0); + sample_neg = AE_NEG32S(sample); + sample = AE_SEL32_HL(sample, sample_neg); + AE_S32X2_IP(sample, p_src, sizeof(ae_int32x2)); + } + } + + /* Copy input buffers (interleaved -> per-FFT) */ + k = 0; + for (i = 0; i < plan->fft_size; i++) + for (j = 0; j < plan->num_ffts; j++) + plan->tmp_i32[j][i] = plan->inb32[k++]; + + /* Clear output buffers and call individual FFTs */ + for (j = 0; j < plan->num_ffts; j++) { + memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32)); + fft_execute_32(plan->fft_plan[j], 0); + } + + /* Multiply with twiddle factors using HiFi3 complex multiply */ + m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size; + for (j = 1; j < plan->num_ffts; j++) { + p_dst = (ae_int32x2 *)plan->tmp_o32[j]; + for (i = 0; i < plan->fft_size; i++) { + k = j * i * m; + /* + * Build twiddle as packed {H=real, L=imag}. + * Use AE_SEL32_LH to pack two scalar loads. + */ + tw = AE_SEL32_LH(AE_MOVDA32(multi_twiddle_real_32[k]), + AE_MOVDA32(multi_twiddle_imag_32[k])); + + data = AE_L32X2_I(p_dst, 0); + + /* Complex multiply: tw * data */ + re = AE_MULF32S_HH(tw, data); + AE_MULSF32S_LL(re, tw, data); + im = AE_MULF32S_HL(tw, data); + AE_MULAF32S_LH(im, tw, data); + + AE_S32X2_IP(AE_ROUND32X2F64SSYM(re, im), + p_dst, sizeof(ae_int32x2)); + } + } + + /* DFT of size 3 */ + j = plan->fft_size; + k = 2 * plan->fft_size; + for (i = 0; i < plan->fft_size; i++) { + x[0] = plan->tmp_o32[0][i]; + x[1] = plan->tmp_o32[1][i]; + x[2] = plan->tmp_o32[2][i]; + dft3_32(x, y); + plan->outb32[i] = y[0]; + plan->outb32[i + j] = y[1]; + plan->outb32[i + k] = y[2]; + } + + /* Shift back for IFFT */ + if (ifft) { + int len = plan->fft_plan[0]->len; + + p_dst = (ae_int32x2 *)plan->outb32; + for (i = 0; i < plan->total_size; i++) { + AE_L32X2_IP(sample, p_dst, 0); + + /* Negate imag part to match reference */ + sample_neg = AE_NEG32S(sample); + sample = AE_SEL32_HL(sample, sample_neg); + + /* Shift left by FFT length to compensate shrink */ + sample = AE_SLAA32S(sample, len); + + /* Integer multiply by 3 (num_ffts) with saturation */ + data = AE_ADD32S(sample, sample); + sample = AE_ADD32S(data, sample); + + AE_S32X2_IP(sample, p_dst, sizeof(ae_int32x2)); + } + } +} + +#endif /* FFT_HIFI3 */ diff --git a/test/cmocka/src/math/fft/CMakeLists.txt b/test/cmocka/src/math/fft/CMakeLists.txt index 326767ec570e..97ac2ee8438f 100644 --- a/test/cmocka/src/math/fft/CMakeLists.txt +++ b/test/cmocka/src/math/fft/CMakeLists.txt @@ -32,6 +32,8 @@ cmocka_test(fft cmocka_test(dft3 dft3.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi.c + ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_generic.c + ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_hifi3.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_32.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_common.c ${PROJECT_SOURCE_DIR}/test/cmocka/src/notifier_mocks.c @@ -41,6 +43,8 @@ cmocka_test(dft3 cmocka_test(fft_multi fft_multi.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi.c + ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_generic.c + ${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_hifi3.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_common.c ${PROJECT_SOURCE_DIR}/src/math/fft/fft_32.c ${PROJECT_SOURCE_DIR}/test/cmocka/src/notifier_mocks.c