diff --git a/src/math/fft/CMakeLists.txt b/src/math/fft/CMakeLists.txt
index 968c9ad38ac8..c605886c335e 100644
--- a/src/math/fft/CMakeLists.txt
+++ b/src/math/fft/CMakeLists.txt
@@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT)
 endif()
 
 if(CONFIG_MATH_FFT_MULTI)
-  list(APPEND base_files fft_multi.c)
+  list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c)
 endif()
 
 is_zephyr(zephyr)
diff --git a/src/math/fft/fft_32_hifi3.c b/src/math/fft/fft_32_hifi3.c
index 1ca88c48268d..537c576468fc 100644
--- a/src/math/fft/fft_32_hifi3.c
+++ b/src/math/fft/fft_32_hifi3.c
@@ -19,13 +19,16 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	ae_int32x2 sample;
 	ae_int32x2 sample1;
 	ae_int32x2 sample2;
+	ae_int32x2 tw;
 	ae_int32x2 *inx = (ae_int32x2 *)plan->inb32;
 	ae_int32x2 *outx = (ae_int32x2 *)plan->outb32;
-	ae_int32x2 *outtop;
-	ae_int32x2 *outbottom;
+	ae_int32x2 *top_ptr;
+	ae_int32x2 *bot_ptr;
 	uint16_t *idx = &plan->bit_reverse_idx[0];
-	int depth, top, bottom, index;
-	int i, j, k, m, n;
+	const int32_t *tw_r;
+	const int32_t *tw_i;
+	int depth, i;
+	int j, k, m, n;
 	int size = plan->size;
 	int len = plan->len;
 
@@ -35,7 +38,6 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	if (!plan->inb32 || !plan->outb32)
 		return;
 
-	/* convert to complex conjugate for ifft */
 	/* step 1: re-arrange input in bit reverse order, and shrink the level to avoid overflow */
 	if (ifft) {
 		/* convert to complex conjugate for ifft */
@@ -54,43 +56,82 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 		}
 	}
 
-	/* step 2: loop to do FFT transform in smaller size */
-	for (depth = 1; depth <= len; ++depth) {
+	/*
+	 * Step 2a: First FFT stage (depth=1, m=2, n=1).
+	 * All butterflies use twiddle factor W^0 = 1+0j,
+	 * so the complex multiply is skipped entirely.
+	 */
+	top_ptr = outx;
+	bot_ptr = outx + 1;
+	for (k = 0; k < size; k += 2) {
+		sample1 = AE_L32X2_I(top_ptr, 0);
+		sample2 = AE_L32X2_I(bot_ptr, 0);
+		sample = AE_ADD32S(sample1, sample2);
+		AE_S32X2_I(sample, top_ptr, 0);
+		sample = AE_SUB32S(sample1, sample2);
+		AE_S32X2_I(sample, bot_ptr, 0);
+		top_ptr += 2;
+		bot_ptr += 2;
+	}
+
+	/* Step 2b: Remaining FFT stages (depth >= 2) */
+	for (depth = 2; depth <= len; ++depth) {
 		m = 1 << depth;
 		n = m >> 1;
 		i = FFT_SIZE_MAX >> depth;
 
+		top_ptr = outx;
+		bot_ptr = outx + n;
+
 		/* doing FFT transforms in size m */
 		for (k = 0; k < size; k += m) {
-			/* doing one FFT transform for size m */
-			for (j = 0; j < n; ++j) {
-				index = i * j;
-				top = k + j;
-				bottom = top + n;
+			/*
+			 * j=0: twiddle factor W^0 = 1+0j,
+			 * butterfly without complex multiply.
+			 */
+			sample1 = AE_L32X2_I(top_ptr, 0);
+			sample = AE_L32X2_I(bot_ptr, 0);
+			sample2 = AE_ADD32S(sample1, sample);
+			AE_S32X2_I(sample2, top_ptr, 0);
+			sample2 = AE_SUB32S(sample1, sample);
+			AE_S32X2_I(sample2, bot_ptr, 0);
+			top_ptr++;
+			bot_ptr++;
 
-				/* load twiddle factor to sample1 */
-				sample1 = twiddle_real_32[index];
-				sample2 = twiddle_imag_32[index];
-				sample1 = AE_SEL32_LH(sample1, sample2);
+			/* j=1..n-1: full butterfly with twiddle multiply */
+			tw_r = &twiddle_real_32[i];
+			tw_i = &twiddle_imag_32[i];
+			for (j = 1; j < n; ++j) {
+				/* load and combine twiddle factor {real, imag} */
+				sample1 = tw_r[0];
+				sample2 = tw_i[0];
+				tw = AE_SEL32_LH(sample1, sample2);
 
 				/* calculate the accumulator: twiddle * bottom */
-				sample2 = outx[bottom];
-				res = AE_MULF32S_HH(sample1, sample2);
-				AE_MULSF32S_LL(res, sample1, sample2);
-				res1 = AE_MULF32S_HL(sample1, sample2);
-				AE_MULAF32S_LH(res1, sample1, sample2);
+				sample2 = AE_L32X2_I(bot_ptr, 0);
+				res = AE_MULF32S_HH(tw, sample2);
+				AE_MULSF32S_LL(res, tw, sample2);
+				res1 = AE_MULF32S_HL(tw, sample2);
+				AE_MULAF32S_LH(res1, tw, sample2);
 				sample = AE_ROUND32X2F64SSYM(res, res1);
-				sample1 = outx[top];
+				sample1 = AE_L32X2_I(top_ptr, 0);
+
 				/* calculate the top output: top = top + accumulate */
 				sample2 = AE_ADD32S(sample1, sample);
-				outtop = outx + top;
-				AE_S32X2_I(sample2, outtop, 0);
+				AE_S32X2_I(sample2, top_ptr, 0);
 
 				/* calculate the bottom output: bottom = top - accumulate */
 				sample2 = AE_SUB32S(sample1, sample);
-				outbottom = outx + bottom;
-				AE_S32X2_I(sample2, outbottom, 0);
+				AE_S32X2_I(sample2, bot_ptr, 0);
+
+				top_ptr++;
+				bot_ptr++;
+				tw_r += i;
+				tw_i += i;
 			}
+			/* advance pointers past current group's bottom half */
+			top_ptr += n;
+			bot_ptr += n;
 		}
 	}
 
diff --git a/src/math/fft/fft_multi.c b/src/math/fft/fft_multi.c
index 2b01f1632478..b16decca012d 100644
--- a/src/math/fft/fft_multi.c
+++ b/src/math/fft/fft_multi.c
@@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL);
 SOF_DEFINE_REG_UUID(math_fft_multi);
 DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO);
 
-/* Constants for size 3 DFT */
-#define DFT3_COEFR	-1073741824	/* int32(-0.5 * 2^31) */
-#define DFT3_COEFI	1859775393	/* int32(sqrt(3) / 2 * 2^31) */
-#define DFT3_SCALE	715827883	/* int32(1/3*2^31) */
-
 struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb,
 					      void *outb, uint32_t size, int bits)
 {
@@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
 	mod_free(mod, plan->bit_reverse_idx);
 	mod_free(mod, plan);
 }
-
-void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
-{
-	const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
-	const struct icomplex32 c1 = {DFT3_COEFR,  DFT3_COEFI};
-	struct icomplex32 x[3];
-	struct icomplex32 p1, p2, sum;
-	int i;
-
-	for (i = 0; i < 3; i++) {
-		x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
-		x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
-	}
-
-	/*
-	 *      | 1   1   1 |
-	 * c =  | 1  c0  c1 | , x = [ x0 x1 x2 ]
-	 *      | 1  c1  c0 |
-	 *
-	 * y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0)
-	 * y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0)
-	 * y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0)
-	 */
-
-	/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
-	icomplex32_adds(&x[0], &x[1], &sum);
-	icomplex32_adds(&x[2], &sum, &y[0]);
-
-	/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
-	icomplex32_mul(&c0, &x[1], &p1);
-	icomplex32_mul(&c1, &x[2], &p2);
-	icomplex32_adds(&p1, &p2, &sum);
-	icomplex32_adds(&x[0], &sum, &y[1]);
-
-	/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
-	icomplex32_mul(&c1, &x[1], &p1);
-	icomplex32_mul(&c0, &x[2], &p2);
-	icomplex32_adds(&p1, &p2, &sum);
-	icomplex32_adds(&x[0], &sum, &y[2]);
-}
-
-void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
-{
-	struct icomplex32 x[FFT_MULTI_COUNT_MAX];
-	struct icomplex32 y[FFT_MULTI_COUNT_MAX];
-	struct icomplex32 t, c;
-	int i, j, k, m;
-
-	/* Handle 2^N FFT */
-	if (plan->num_ffts == 1) {
-		memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
-		fft_execute_32(plan->fft_plan[0], ifft);
-		return;
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
-	FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
-	FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
-	FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
-#endif
-
-	/* convert to complex conjugate for IFFT */
-	if (ifft) {
-		for (i = 0; i < plan->total_size; i++)
-			icomplex32_conj(&plan->inb32[i]);
-	}
-
-	/* Copy input buffers */
-	k = 0;
-	for (i = 0; i < plan->fft_size; i++)
-		for (j = 0; j < plan->num_ffts; j++)
-			plan->tmp_i32[j][i] = plan->inb32[k++];
-
-	/* Clear output buffers and call individual FFTs*/
-	for (j = 0; j < plan->num_ffts; j++) {
-		memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
-		fft_execute_32(plan->fft_plan[j], 0);
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (j = 0; j < plan->num_ffts; j++)
-		for (i = 0; i < plan->fft_size; i++)
-			fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
-#endif
-
-	/* Multiply with twiddle factors */
-	m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
-	for (j = 1; j < plan->num_ffts; j++) {
-		for (i = 0; i < plan->fft_size; i++) {
-			c = plan->tmp_o32[j][i];
-			k = j * i * m;
-			t.real = multi_twiddle_real_32[k];
-			t.imag = multi_twiddle_imag_32[k];
-			//fprintf(fh3, "%d %d\n", t.real, t.imag);
-			icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
-		}
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (j = 0; j < plan->num_ffts; j++)
-		for (i = 0; i < plan->fft_size; i++)
-			fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
-#endif
-
-	/* DFT of size 3 */
-	j = plan->fft_size;
-	k = 2 * plan->fft_size;
-	for (i = 0; i < plan->fft_size; i++) {
-		x[0] = plan->tmp_o32[0][i];
-		x[1] = plan->tmp_o32[1][i];
-		x[2] = plan->tmp_o32[2][i];
-		dft3_32(x, y);
-		plan->outb32[i] = y[0];
-		plan->outb32[i + j] = y[1];
-		plan->outb32[i + k] = y[2];
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (i = 0; i < plan->total_size; i++)
-		fprintf(fh4, "%d %d\n",  plan->outb32[i].real, plan->outb32[i].imag);
-#endif
-
-	/* shift back for IFFT */
-
-	/* TODO: Check if time shift method for IFFT is more efficient or more accurate
-	 * tmp = 1 / N * fft(X);
-	 * x = tmp([1 N:-1:2])
-	 */
-	if (ifft) {
-		/*
-		 * no need to divide N as it is already done in the input side
-		 * for Q1.31 format. Instead, we need to multiply N to compensate
-		 * the shrink we did in the FFT transform.
-		 */
-		for (i = 0; i < plan->total_size; i++) {
-			/* Need to negate imag part to match reference */
-			plan->outb32[i].imag = -plan->outb32[i].imag;
-			icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
-					 &plan->outb32[i]);
-			plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
-			plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
-		}
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4);
-#endif
-}
diff --git a/src/math/fft/fft_multi_generic.c b/src/math/fft/fft_multi_generic.c
new file mode 100644
index 000000000000..dd30368bbdbb
--- /dev/null
+++ b/src/math/fft/fft_multi_generic.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2025-2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+#include <sof/audio/format.h>
+#include <sof/math/icomplex32.h>
+#include <sof/common.h>
+#include <sof/math/fft.h>
+
+/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */
+#define FFT_MULTI_TWIDDLE_SIZE 2048
+extern const int32_t multi_twiddle_real_32[];
+extern const int32_t multi_twiddle_imag_32[];
+
+/* Constants for size 3 DFT */
+#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
+#define DFT3_COEFI 1859775393  /* int32(sqrt(3) / 2 * 2^31) */
+#define DFT3_SCALE 715827883   /* int32(1/3*2^31) */
+
+#ifdef FFT_GENERIC
+
+void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
+{
+	const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
+	const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
+	struct icomplex32 x[3];
+	struct icomplex32 p1, p2, sum;
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
+		x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
+	}
+
+	/*
+	 *      | 1   1   1 |
+	 * c =  | 1  c0  c1 | , x = [ x0 x1 x2 ]
+	 *      | 1  c1  c0 |
+	 *
+	 * y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(2)
+	 * y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(2)
+	 * y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(2)
+	 */
+
+	/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
+	icomplex32_adds(&x[0], &x[1], &sum);
+	icomplex32_adds(&x[2], &sum, &y[0]);
+
+	/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
+	icomplex32_mul(&c0, &x[1], &p1);
+	icomplex32_mul(&c1, &x[2], &p2);
+	icomplex32_adds(&p1, &p2, &sum);
+	icomplex32_adds(&x[0], &sum, &y[1]);
+
+	/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
+	icomplex32_mul(&c1, &x[1], &p1);
+	icomplex32_mul(&c0, &x[2], &p2);
+	icomplex32_adds(&p1, &p2, &sum);
+	icomplex32_adds(&x[0], &sum, &y[2]);
+}
+
+void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
+{
+	struct icomplex32 x[FFT_MULTI_COUNT_MAX];
+	struct icomplex32 y[FFT_MULTI_COUNT_MAX];
+	struct icomplex32 t, c;
+	int i, j, k, m;
+
+	/* Handle 2^N FFT */
+	if (plan->num_ffts == 1) {
+		memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
+		fft_execute_32(plan->fft_plan[0], ifft);
+		return;
+	}
+
+#ifdef DEBUG_DUMP_TO_FILE
+	FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
+	FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
+	FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
+	FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
+#endif
+
+	/* convert to complex conjugate for IFFT */
+	if (ifft) {
+		for (i = 0; i < plan->total_size; i++)
+			icomplex32_conj(&plan->inb32[i]);
+	}
+
+	/* Copy input buffers */
+	k = 0;
+	for (i = 0; i < plan->fft_size; i++)
+		for (j = 0; j < plan->num_ffts; j++)
+			plan->tmp_i32[j][i] = plan->inb32[k++];
+
+	/* Clear output buffers and call individual FFTs*/
+	for (j = 0; j < plan->num_ffts; j++) {
+		memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
+		fft_execute_32(plan->fft_plan[j], 0);
+	}
+
+#ifdef DEBUG_DUMP_TO_FILE
+	for (j = 0; j < plan->num_ffts; j++)
+		for (i = 0; i < plan->fft_size; i++)
+			fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
+#endif
+
+	/* Multiply with twiddle factors */
+	m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
+	for (j = 1; j < plan->num_ffts; j++) {
+		for (i = 0; i < plan->fft_size; i++) {
+			c = plan->tmp_o32[j][i];
+			k = j * i * m;
+			t.real = multi_twiddle_real_32[k];
+			t.imag = multi_twiddle_imag_32[k];
+			// fprintf(fh3, "%d %d\n", t.real, t.imag);
+			icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
+		}
+	}
+
+#ifdef DEBUG_DUMP_TO_FILE
+	for (j = 0; j < plan->num_ffts; j++)
+		for (i = 0; i < plan->fft_size; i++)
+			fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
+#endif
+
+	/* DFT of size 3 */
+	j = plan->fft_size;
+	k = 2 * plan->fft_size;
+	for (i = 0; i < plan->fft_size; i++) {
+		x[0] = plan->tmp_o32[0][i];
+		x[1] = plan->tmp_o32[1][i];
+		x[2] = plan->tmp_o32[2][i];
+		dft3_32(x, y);
+		plan->outb32[i] = y[0];
+		plan->outb32[i + j] = y[1];
+		plan->outb32[i + k] = y[2];
+	}
+
+#ifdef DEBUG_DUMP_TO_FILE
+	for (i = 0; i < plan->total_size; i++)
+		fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
+#endif
+
+	/* shift back for IFFT */
+
+	/* TODO: Check if time shift method for IFFT is more efficient or more accurate
+	 * tmp = 1 / N * fft(X);
+	 * x = tmp([1 N:-1:2])
+	 */
+	if (ifft) {
+		/*
+		 * no need to divide N as it is already done in the input side
+		 * for Q1.31 format. Instead, we need to multiply N to compensate
+		 * the shrink we did in the FFT transform.
+		 */
+		for (i = 0; i < plan->total_size; i++) {
+			/* Need to negate imag part to match reference */
+			plan->outb32[i].imag = -plan->outb32[i].imag;
+			icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
+					 &plan->outb32[i]);
+			plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
+			plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
+		}
+	}
+
+#ifdef DEBUG_DUMP_TO_FILE
+	fclose(fh1);
+	fclose(fh2);
+	fclose(fh3);
+	fclose(fh4);
+#endif
+}
+
+#endif /* FFT_GENERIC */
diff --git a/src/math/fft/fft_multi_hifi3.c b/src/math/fft/fft_multi_hifi3.c
new file mode 100644
index 000000000000..bd1966168369
--- /dev/null
+++ b/src/math/fft/fft_multi_hifi3.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2025-2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * @file fft_multi_hifi3.c
+ * @brief HiFi3 optimized multi-radix FFT functions.
+ *
+ * This file provides HiFi3 optimized implementations of dft3_32() and
+ * fft_multi_execute_32() using Xtensa HiFi3 intrinsics. These replace
+ * the generic versions when HiFi3 or HiFi4 hardware is available.
+ *
+ * Key optimizations over the generic versions:
+ * - Packed {real, imag} processing using ae_int32x2 registers
+ * - 64-bit MAC accumulation for fused complex multiply-add
+ * - Saturating 32-bit arithmetic via AE_ADD32S/AE_SLAA32S
+ * - Vectorized conjugate, shift, and scale in the IFFT path
+ */
+
+#include <sof/audio/module_adapter/module/generic.h>
+#include <sof/audio/format.h>
+#include <sof/math/icomplex32.h>
+#include <sof/common.h>
+#include <sof/math/fft.h>
+
+/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */
+#define FFT_MULTI_TWIDDLE_SIZE 2048
+extern const int32_t multi_twiddle_real_32[];
+extern const int32_t multi_twiddle_imag_32[];
+
+#ifdef FFT_HIFI3
+
+#include <xtensa/tie/xt_hifi3.h>
+
+/** @brief Q1.31 constant -0.5 */
+#define DFT3_COEFR -1073741824
+/** @brief Q1.31 constant sqrt(3)/2 */
+#define DFT3_COEFI 1859775393
+/** @brief Q1.31 constant 1/3 */
+#define DFT3_SCALE 715827883
+
+/**
+ * dft3_32() - Compute 3-point DFT of Q1.31 complex data (HiFi3).
+ * @param x_in Pointer to 3 input complex samples in Q1.31.
+ * @param y Pointer to 3 output complex samples in Q1.31.
+ *
+ * Computes the DFT matrix-vector product:
+ *
+ *      | 1   1   1 |
+ * Y =  | 1  c0  c1 | * X / 3
+ *      | 1  c1  c0 |
+ *
+ * where c0 = exp(-j*2*pi/3) and c1 = exp(+j*2*pi/3).
+ * Input is prescaled by 1/3 to prevent overflow.
+ *
+ * The two complex multiplies for each output bin are fused
+ * into a single 64-bit accumulator pair, avoiding intermediate
+ * rounding and saving instructions.
+ */
+void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
+{
+	ae_int32x2 *p_in = (ae_int32x2 *)x_in;
+	ae_int32x2 *p_out = (ae_int32x2 *)y;
+	ae_int32x2 x0, x1, x2;
+	ae_int32x2 c0, c1;
+	ae_int32x2 scale;
+	ae_int32x2 sum, result;
+	ae_int64 re, im;
+
+	/*
+	 * Set up DFT3 twiddle factors as packed {H=real, L=imag}.
+	 * c0 = {-0.5, -sqrt(3)/2}
+	 * c1 = {-0.5, +sqrt(3)/2}
+	 */
+	scale = AE_MOVDA32(DFT3_COEFR);
+	c0 = AE_SEL32_LH(scale, AE_MOVDA32(-DFT3_COEFI));
+	c1 = AE_SEL32_LH(scale, AE_MOVDA32(DFT3_COEFI));
+
+	/* Scale factor 1/3, broadcast to both H and L */
+	scale = AE_MOVDA32(DFT3_SCALE);
+
+	/* Load input samples as packed {real, imag} */
+	x0 = AE_L32X2_I(p_in, 0);
+	x1 = AE_L32X2_I(p_in, 1 * sizeof(ae_int32x2));
+	x2 = AE_L32X2_I(p_in, 2 * sizeof(ae_int32x2));
+
+	/* Scale all inputs by 1/3 to prevent overflow */
+	x0 = AE_MULFP32X2RS(x0, scale);
+	x1 = AE_MULFP32X2RS(x1, scale);
+	x2 = AE_MULFP32X2RS(x2, scale);
+
+	/* y[0] = x[0] + x[1] + x[2] */
+	sum = AE_ADD32S(x0, x1);
+	AE_S32X2_I(AE_ADD32S(sum, x2), p_out, 0);
+
+	/*
+	 * y[1] = x[0] + c0 * x[1] + c1 * x[2]
+	 *
+	 * Fuse two complex multiplies and their sum into a single
+	 * 64-bit accumulator pair to avoid intermediate rounding.
+	 *
+	 * Real part: c0.re*x1.re - c0.im*x1.im + c1.re*x2.re - c1.im*x2.im
+	 * Imag part: c0.re*x1.im + c0.im*x1.re + c1.re*x2.im + c1.im*x2.re
+	 */
+	re = AE_MULF32S_HH(c0, x1); /* c0.re * x1.re */
+	AE_MULSF32S_LL(re, c0, x1); /* -= c0.im * x1.im */
+	AE_MULAF32S_HH(re, c1, x2); /* += c1.re * x2.re */
+	AE_MULSF32S_LL(re, c1, x2); /* -= c1.im * x2.im */
+
+	im = AE_MULF32S_HL(c0, x1); /* c0.re * x1.im */
+	AE_MULAF32S_LH(im, c0, x1); /* += c0.im * x1.re */
+	AE_MULAF32S_HL(im, c1, x2); /* += c1.re * x2.im */
+	AE_MULAF32S_LH(im, c1, x2); /* += c1.im * x2.re */
+
+	result = AE_ROUND32X2F64SSYM(re, im);
+	AE_S32X2_I(AE_ADD32S(x0, result), p_out, sizeof(ae_int32x2));
+
+	/*
+	 * y[2] = x[0] + c1 * x[1] + c0 * x[2]
+	 *
+	 * Same structure as y[1] but with swapped coefficients.
+	 */
+	re = AE_MULF32S_HH(c1, x1); /* c1.re * x1.re */
+	AE_MULSF32S_LL(re, c1, x1); /* -= c1.im * x1.im */
+	AE_MULAF32S_HH(re, c0, x2); /* += c0.re * x2.re */
+	AE_MULSF32S_LL(re, c0, x2); /* -= c0.im * x2.im */
+
+	im = AE_MULF32S_HL(c1, x1); /* c1.re * x1.im */
+	AE_MULAF32S_LH(im, c1, x1); /* += c1.im * x1.re */
+	AE_MULAF32S_HL(im, c0, x2); /* += c0.re * x2.im */
+	AE_MULAF32S_LH(im, c0, x2); /* += c0.im * x2.re */
+
+	result = AE_ROUND32X2F64SSYM(re, im);
+	AE_S32X2_I(AE_ADD32S(x0, result), p_out, 2 * sizeof(ae_int32x2));
+}
+
+/**
+ * fft_multi_execute_32() - Execute multi-radix FFT/IFFT (HiFi3).
+ * @param plan Pointer to multi-FFT plan.
+ * @param ifft False for FFT, true for IFFT.
+ *
+ * Performs a composite FFT of size N = num_ffts * fft_size (e.g. 3 * 512 = 1536).
+ * For power-of-two sizes the call is forwarded to fft_execute_32().
+ *
+ * HiFi3 optimizations vs. the generic path:
+ * - IFFT conjugate uses packed AE_SEL32_HL / AE_NEG32S
+ * - Twiddle multiply uses fused 64-bit MAC (no intermediate rounding)
+ * - IFFT shift-back combines negate, shift, and *3 scale in HiFi3 ops
+ */
+void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
+{
+	struct icomplex32 x[FFT_MULTI_COUNT_MAX];
+	struct icomplex32 y[FFT_MULTI_COUNT_MAX];
+	ae_int32x2 *p_src;
+	ae_int32x2 *p_dst;
+	ae_int32x2 sample, sample_neg, tw, data;
+	ae_int64 re, im;
+	int i, j, k, m;
+
+	/* Handle 2^N FFT */
+	if (plan->num_ffts == 1) {
+		memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
+		fft_execute_32(plan->fft_plan[0], ifft);
+		return;
+	}
+
+	/* Convert to complex conjugate for IFFT */
+	if (ifft) {
+		p_src = (ae_int32x2 *)plan->inb32;
+		for (i = 0; i < plan->total_size; i++) {
+			AE_L32X2_IP(sample, p_src, 0);
+			sample_neg = AE_NEG32S(sample);
+			sample = AE_SEL32_HL(sample, sample_neg);
+			AE_S32X2_IP(sample, p_src, sizeof(ae_int32x2));
+		}
+	}
+
+	/* Copy input buffers (interleaved -> per-FFT) */
+	k = 0;
+	for (i = 0; i < plan->fft_size; i++)
+		for (j = 0; j < plan->num_ffts; j++)
+			plan->tmp_i32[j][i] = plan->inb32[k++];
+
+	/* Clear output buffers and call individual FFTs */
+	for (j = 0; j < plan->num_ffts; j++) {
+		memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
+		fft_execute_32(plan->fft_plan[j], 0);
+	}
+
+	/* Multiply with twiddle factors using HiFi3 complex multiply */
+	m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
+	for (j = 1; j < plan->num_ffts; j++) {
+		p_dst = (ae_int32x2 *)plan->tmp_o32[j];
+		for (i = 0; i < plan->fft_size; i++) {
+			k = j * i * m;
+			/*
+			 * Build twiddle as packed {H=real, L=imag}.
+			 * Use AE_SEL32_LH to pack two scalar loads.
+			 */
+			tw = AE_SEL32_LH(AE_MOVDA32(multi_twiddle_real_32[k]),
+					 AE_MOVDA32(multi_twiddle_imag_32[k]));
+
+			data = AE_L32X2_I(p_dst, 0);
+
+			/* Complex multiply: tw * data */
+			re = AE_MULF32S_HH(tw, data);
+			AE_MULSF32S_LL(re, tw, data);
+			im = AE_MULF32S_HL(tw, data);
+			AE_MULAF32S_LH(im, tw, data);
+
+			AE_S32X2_IP(AE_ROUND32X2F64SSYM(re, im),
+				    p_dst, sizeof(ae_int32x2));
+		}
+	}
+
+	/* DFT of size 3 */
+	j = plan->fft_size;
+	k = 2 * plan->fft_size;
+	for (i = 0; i < plan->fft_size; i++) {
+		x[0] = plan->tmp_o32[0][i];
+		x[1] = plan->tmp_o32[1][i];
+		x[2] = plan->tmp_o32[2][i];
+		dft3_32(x, y);
+		plan->outb32[i] = y[0];
+		plan->outb32[i + j] = y[1];
+		plan->outb32[i + k] = y[2];
+	}
+
+	/* Shift back for IFFT */
+	if (ifft) {
+		int len = plan->fft_plan[0]->len;
+
+		p_dst = (ae_int32x2 *)plan->outb32;
+		for (i = 0; i < plan->total_size; i++) {
+			AE_L32X2_IP(sample, p_dst, 0);
+
+			/* Negate imag part to match reference */
+			sample_neg = AE_NEG32S(sample);
+			sample = AE_SEL32_HL(sample, sample_neg);
+
+			/* Shift left by FFT length to compensate shrink */
+			sample = AE_SLAA32S(sample, len);
+
+			/* Integer multiply by 3 (num_ffts) with saturation */
+			data = AE_ADD32S(sample, sample);
+			sample = AE_ADD32S(data, sample);
+
+			AE_S32X2_IP(sample, p_dst, sizeof(ae_int32x2));
+		}
+	}
+}
+
+#endif /* FFT_HIFI3 */
diff --git a/test/cmocka/src/math/fft/CMakeLists.txt b/test/cmocka/src/math/fft/CMakeLists.txt
index 326767ec570e..97ac2ee8438f 100644
--- a/test/cmocka/src/math/fft/CMakeLists.txt
+++ b/test/cmocka/src/math/fft/CMakeLists.txt
@@ -32,6 +32,8 @@ cmocka_test(fft
 cmocka_test(dft3
 	dft3.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi.c
+	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_generic.c
+	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_hifi3.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_32.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_common.c
 	${PROJECT_SOURCE_DIR}/test/cmocka/src/notifier_mocks.c
@@ -41,6 +43,8 @@ cmocka_test(dft3
 cmocka_test(fft_multi
 	fft_multi.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi.c
+	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_generic.c
+	${PROJECT_SOURCE_DIR}/src/math/fft/fft_multi_hifi3.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_common.c
 	${PROJECT_SOURCE_DIR}/src/math/fft/fft_32.c
 	${PROJECT_SOURCE_DIR}/test/cmocka/src/notifier_mocks.c