From fbf60587765d16070826abbb470dc0f01f827979 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 17 Mar 2026 20:00:41 +0200 Subject: [PATCH 1/2] Audio: STFT Process: Add Xtensa HiFi function versions This patch adds to stft_process-hifi3.c the HiFi3 versions of the higher-complexity functions stft_process_apply_window() and stft_process_overlap_add_ifft_buffer(). The functions with no clear HiFi optimization benefit are moved from stft_process-generic.c to stft_process_common.c. Those functions only move data and do practically no processing of the samples. The stft_process_setup() function is changed to allocate buffers with mod_balloc_align() to ensure that a 32-bit sample pair or a complex number is aligned for 64-bit Xtensa SIMD. This patch also adds checks for other parameters to ensure that the STFT is set up in a way that can be executed. The patch also fixes a too-large allocation in setup. The window function buffer is shared by all channels, so its size should not be multiplied by the channel count. This change saves 17 MCPS (from 63 MCPS to 46 MCPS). The measurement was done by running the scripts: scripts/rebuild-testbench.sh -p mtl scripts/sof-testbench-helper.sh -x -m stft_process_1024_256_ \ -p profile-stft_process.txt The above STFT used an FFT length of 1024 with a hop of 256. 
Signed-off-by: Seppo Ingalsuo --- src/audio/stft_process/CMakeLists.txt | 1 + src/audio/stft_process/Kconfig | 2 + src/audio/stft_process/Kconfig.simd | 31 ++ src/audio/stft_process/stft_process-generic.c | 396 +----------------- src/audio/stft_process/stft_process-hifi3.c | 146 +++++++ src/audio/stft_process/stft_process.h | 106 ++++- src/audio/stft_process/stft_process_common.c | 312 ++++++++++++++ src/audio/stft_process/stft_process_setup.c | 32 +- 8 files changed, 628 insertions(+), 398 deletions(-) create mode 100644 src/audio/stft_process/Kconfig.simd create mode 100644 src/audio/stft_process/stft_process-hifi3.c diff --git a/src/audio/stft_process/CMakeLists.txt b/src/audio/stft_process/CMakeLists.txt index 66ccb2276c58..e3a13e17bb93 100644 --- a/src/audio/stft_process/CMakeLists.txt +++ b/src/audio/stft_process/CMakeLists.txt @@ -8,6 +8,7 @@ else() add_local_sources(sof stft_process_setup.c) add_local_sources(sof stft_process_common.c) add_local_sources(sof stft_process-generic.c) + add_local_sources(sof stft_process-hifi3.c) if(CONFIG_IPC_MAJOR_4) add_local_sources(sof stft_process-ipc4.c) diff --git a/src/audio/stft_process/Kconfig b/src/audio/stft_process/Kconfig index b73bdebe3bb4..426d272477a9 100644 --- a/src/audio/stft_process/Kconfig +++ b/src/audio/stft_process/Kconfig @@ -19,6 +19,8 @@ config COMP_STFT_PROCESS if COMP_STFT_PROCESS +rsource "Kconfig.simd" + config STFT_PROCESS_MAGNITUDE_PHASE bool "Convert FFTs to polar magnitude and phase" default n diff --git a/src/audio/stft_process/Kconfig.simd b/src/audio/stft_process/Kconfig.simd new file mode 100644 index 000000000000..0b981df583fa --- /dev/null +++ b/src/audio/stft_process/Kconfig.simd @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause + +comment "STFT Process optimization level select" + +choice "COMP_STFT_PROCESS_SIMD_LEVEL_SELECT" + prompt "Choose which SIMD level is used for the STFT Process module" + depends on COMP_STFT_PROCESS + default COMP_STFT_PROCESS_HIFI_MAX + + config 
COMP_STFT_PROCESS_HIFI_MAX + prompt "SIMD will be selected by toolchain pre-defined header" + bool + help + When this is selected, the optimization level will be + determined by the toolchain pre-defined macros in the + core isa header file. + + config COMP_STFT_PROCESS_HIFI_3 + prompt "Choose HIFI3 intrinsic optimized STFT Process module" + bool + help + This option is used to build HIFI3 intrinsic optimized + STFT Process code. + + config COMP_STFT_PROCESS_HIFI_NONE + prompt "Choose generic C STFT Process module, no HIFI SIMD involved" + bool + help + This option is used to build STFT Process + with generic C code. +endchoice diff --git a/src/audio/stft_process/stft_process-generic.c b/src/audio/stft_process/stft_process-generic.c index 3399c24657a4..6ade95bb6ae8 100644 --- a/src/audio/stft_process/stft_process-generic.c +++ b/src/audio/stft_process/stft_process-generic.c @@ -1,396 +1,14 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2025 Intel Corporation. +// Copyright(c) 2025-2026 Intel Corporation. -#include -#include #include -#include -#include -#include +#include +#include #include #include "stft_process.h" -#if CONFIG_FORMAT_S32LE -/** - * stft_process_source_s32() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. 
- */ -int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *ibuf; - int32_t const *x, *x_start, *x_end; - int x_size; - int bytes = frames * cd->frame_bytes; - int frames_left = frames; - int ret; - int n1; - int n2; - int channels = cd->channels; - int n; - int i; - int j; - - /* Get pointer to source data in circular buffer */ - ret = source_get_data_s32(source, bytes, &x, &x_start, &x_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - x_end = x_start + x_size; - - while (frames_left) { - /* Find out samples to process before first wrap or end of data. */ - ibuf = &state->ibuf[0]; - n1 = (x_end - x) / cd->channels; - n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); - n = MIN(n1, n2); - n = MIN(n, frames_left); - for (i = 0; i < n; i++) { - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - *ibuf->w_ptr++ = *x++; - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); - } - - if (x >= x_end) - x -= x_size; - - /* Update processed samples count for next loop iteration. */ - frames_left -= n; - } - - /* Update the source for bytes consumed. Return success. */ - source_release_data(source, bytes); - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->s_avail += frames; - ibuf->s_free -= frames; - } - - return 0; -} - -/** - * stft_process_sink_s32() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. 
The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *obuf; - int32_t *y, *y_start, *y_end; - int frames_remain = frames; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int y_size; - int ret; - int ch, n1, n, i; - - /* Get pointer to sink data in circular buffer */ - ret = sink_get_buffer_s32(sink, bytes, &y, &y_start, &y_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - y_end = y_start + y_size; - while (frames_remain) { - /* Find out samples to process before first wrap or end of data. */ - obuf = &state->obuf[0]; - n1 = (y_end - y) / cd->channels; - n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); - n = MIN(n1, n); - n = MIN(n, frames_remain); - - for (i = 0; i < n; i++) { - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - *y++ = *obuf->r_ptr; - *obuf->r_ptr++ = 0; /* clear overlap add mix */ - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (ch = 0; ch < cd->channels; ch++) { - obuf = &state->obuf[ch]; - obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); - } - - if (y >= y_end) - y -= y_size; - - /* Update processed samples count for next loop iteration. */ - frames_remain -= n; - } - - /* Update the sink for bytes produced. Return success. */ - sink_commit_buffer(sink, bytes); - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->s_avail -= frames; - obuf->s_free += frames; - } - - return 0; -} -#endif /* CONFIG_FORMAT_S32LE */ - -#if CONFIG_FORMAT_S16LE -/** - * stft_process_source_s16() - Process S16_LE format. - * @mod: Pointer to module data. 
- * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *ibuf; - int16_t const *x, *x_start, *x_end; - int16_t in; - int x_size; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int frames_left = frames; - int ret; - int n1; - int n2; - int n; - int i; - int j; - - /* Get pointer to source data in circular buffer, get buffer start and size to - * check for wrap. The size in bytes is converted to number of s16 samples to - * control the samples process loop. If the number of bytes requested is not - * possible, an error is returned. - */ - ret = source_get_data_s16(source, bytes, &x, &x_start, &x_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - x_end = x_start + x_size; - - while (frames_left) { - /* Find out samples to process before first wrap or end of data. 
*/ - ibuf = &state->ibuf[0]; - n1 = (x_end - x) / cd->channels; - n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); - n = MIN(n1, n2); - n = MIN(n, frames_left); - for (i = 0; i < n; i++) { - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - in = *x++; - *ibuf->w_ptr++ = (int32_t)in << 16; - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); - } - - if (x >= x_end) - x -= x_size; - - /* Update processed samples count for next loop iteration. */ - frames_left -= n; - } - - /* Update the source for bytes consumed. Return success. */ - source_release_data(source, bytes); - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->s_avail += frames; - ibuf->s_free -= frames; - } - return 0; -} - -/** - * stft_process_sink_s16() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *obuf; - int16_t *y, *y_start, *y_end; - int frames_remain = frames; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int y_size; - int ret; - int ch, n1, n, i; - - /* Get pointer to sink data in circular buffer */ - ret = sink_get_buffer_s16(sink, bytes, &y, &y_start, &y_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. 
- */ - y_end = y_start + y_size; - while (frames_remain) { - /* Find out samples to process before first wrap or end of data. */ - obuf = &state->obuf[0]; - n1 = (y_end - y) / cd->channels; - n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); - n = MIN(n1, n); - n = MIN(n, frames_remain); - - for (i = 0; i < n; i++) { - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - *y++ = sat_int16(Q_SHIFT_RND(*obuf->r_ptr, 31, 15)); - *obuf->r_ptr++ = 0; /* clear overlap add mix */ - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); - } - - if (y >= y_end) - y -= y_size; - - /* Update processed samples count for next loop iteration. */ - frames_remain -= n; - } - - /* Update the sink for bytes produced. Return success. */ - sink_commit_buffer(sink, bytes); - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->s_avail -= frames; - obuf->s_free += frames; - } - - return 0; -} -#endif /* CONFIG_FORMAT_S16LE */ - -void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, - int prev_data_length) -{ - /* Fill prev_data from input buffer */ - int32_t *r = buf->r_ptr; - int32_t *p = prev_data; - int copied; - int nmax; - int n; - - for (copied = 0; copied < prev_data_length; copied += n) { - nmax = prev_data_length - copied; - n = stft_process_buffer_samples_without_wrap(buf, r); - n = MIN(n, nmax); - memcpy(p, r, sizeof(int32_t) * n); /* Not using memcpy_s() due to speed need */ - p += n; - r += n; - r = stft_process_buffer_wrap(buf, r); - } - - buf->s_avail -= copied; - buf->s_free += copied; - buf->r_ptr = r; -} - -void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch) -{ - struct stft_process_buffer *ibuf = &state->ibuf[ch]; - struct stft_process_fft *fft = &state->fft; - int32_t *prev_data = state->prev_data[ch]; - int32_t 
*r = ibuf->r_ptr; - int copied; - int nmax; - int idx; - int j; - int n; - - /* Copy overlapped samples from state buffer. Imaginary part of input - * remains zero. - */ - for (j = 0; j < state->prev_data_size; j++) { - fft->fft_buf[j].real = prev_data[j]; - fft->fft_buf[j].imag = 0; - } - - /* Copy hop size of new data from circular buffer */ - idx = state->prev_data_size; - for (copied = 0; copied < fft->fft_hop_size; copied += n) { - nmax = fft->fft_hop_size - copied; - n = stft_process_buffer_samples_without_wrap(ibuf, r); - n = MIN(n, nmax); - for (j = 0; j < n; j++) { - fft->fft_buf[idx].real = *r++; - fft->fft_buf[idx].imag = 0; - idx++; - } - r = stft_process_buffer_wrap(ibuf, r); - } - - ibuf->s_avail -= copied; - ibuf->s_free += copied; - ibuf->r_ptr = r; - - /* Copy for next time data back to overlap buffer */ - idx = fft->fft_hop_size; - for (j = 0; j < state->prev_data_size; j++) - prev_data[j] = fft->fft_buf[idx + j].real; -} - +#if SOF_USE_HIFI(NONE, COMP_STFT_PROCESS) void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch) { struct stft_process_buffer *obuf = &state->obuf[ch]; @@ -405,6 +23,11 @@ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int while (samples_remain) { n = stft_process_buffer_samples_without_wrap(obuf, w); n = MIN(samples_remain, n); + + /* Abort if n is zero to avoid infinite loop. The assert can + * trigger only with incorrect usage of this function. 
+ */ + assert(n); for (i = 0; i < n; i++) { sample = Q_MULTSR_32X32((int64_t)state->gain_comp, fft->fft_buf[idx].real, 31, 31, 31); @@ -434,3 +57,4 @@ void stft_process_apply_window(struct stft_process_state *state) sat_int32(Q_MULTSR_32X32((int64_t)fft->fft_buf[i + j].real, state->window[j], 31, 31, 31)); } +#endif /* SOF_USE_HIFI(NONE, COMP_STFT_PROCESS) */ diff --git a/src/audio/stft_process/stft_process-hifi3.c b/src/audio/stft_process/stft_process-hifi3.c new file mode 100644 index 000000000000..ae7f4f9b69dd --- /dev/null +++ b/src/audio/stft_process/stft_process-hifi3.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2025-2026 Intel Corporation. + +/** + * \file + * \brief HiFi3 SIMD-optimized helpers for the STFT processing component. + * + * This compilation unit provides HiFi3 intrinsic versions of selected + * hot-path helpers. It is guarded by SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) + * so only one of the generic or hifi implementations is active. + */ + +#include +#include +#include +#include +#include "stft_process.h" + +#if SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) + +#include + +/** + * stft_process_apply_window() - Multiply FFT buffer by the analysis window. + * @state: STFT processing state that contains the FFT buffer and window. + * + * The real part of each icomplex32 sample in the FFT buffer is multiplied + * by the corresponding Q1.31 window coefficient. + */ +void stft_process_apply_window(struct stft_process_state *state) +{ + struct stft_process_fft *fft = &state->fft; + ae_int32 *buf; + const ae_int32x2 *win; + ae_f32x2 data01, data23; + ae_f32x2 win01, win23; + ae_int32x2 d0, d1; + int fft_size = fft->fft_size; + int i = fft->fft_fill_start_idx; + int j; + int n4; + + /* + * buf points to {real, imag} pairs (struct icomplex32). + * win points to scalar Q1.31 window coefficients. 
+ * + * We load only the real part of each complex sample, multiply it by + * the window value, then store the updated real part back. + * The imaginary part is left untouched. + * + * Stride for buf is sizeof(ae_int32x2) = 8 bytes per complex sample. + * Stride for win is sizeof(ae_int32) = 4 bytes per scalar window value. + */ + buf = (ae_int32 *)&fft->fft_buf[i]; + win = (const ae_int32x2 *)state->window; + + assert(!(fft_size & 3)); + + /* Main loop: process 4 samples per iteration */ + n4 = fft_size >> 2; + for (j = 0; j < n4; j++) { + /* Load four FFT real part values, combine into data01 and data23, + * real[0] goes to data01 high, real[1] goes to data01 low. + */ + d0 = AE_L32_I(buf, 0 * sizeof(ae_int32x2)); + d1 = AE_L32_I(buf, 1 * sizeof(ae_int32x2)); + data01 = AE_SEL32_HH(d0, d1); + d0 = AE_L32_I(buf, 2 * sizeof(ae_int32x2)); + d1 = AE_L32_I(buf, 3 * sizeof(ae_int32x2)); + data23 = AE_SEL32_HH(d0, d1); + + /* Load four window coefficients, + * win[0] goes to win01 high, win[1] goes to win01 low. + */ + AE_L32X2_IP(win01, win, sizeof(ae_int32x2)); + AE_L32X2_IP(win23, win, sizeof(ae_int32x2)); + + /* Multiply with window function */ + data01 = AE_MULFP32X2RS(data01, win01); + data23 = AE_MULFP32X2RS(data23, win23); + + /* Store back the updated real parts */ + AE_S32_L_IP(AE_SEL32_LH(data01, data01), buf, sizeof(ae_int32x2)); + AE_S32_L_IP(data01, buf, sizeof(ae_int32x2)); + AE_S32_L_IP(AE_SEL32_LH(data23, data23), buf, sizeof(ae_int32x2)); + AE_S32_L_IP(data23, buf, sizeof(ae_int32x2)); + } +} + +/** + * stft_process_overlap_add_ifft_buffer() - Overlap-add IFFT output to circular output buffer. + * @state: STFT processing state. + * @ch: Channel index. + * + * Each IFFT output sample is multiplied by gain_comp (Q1.31 x Q1.31) and + * added with saturation to the existing content of the circular output + * buffer. HiFi3 AE_MULAFP32X2RS handles the multiply and + * the saturating accumulation for two samples at a time. 
+ * + * Note: obuf must be even number of samples and 64-bit aligned. + */ +void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch) +{ + struct stft_process_buffer *obuf = &state->obuf[ch]; + struct stft_process_fft *fft = &state->fft; + ae_f32x2 gain = AE_MOVDA32(state->gain_comp); + ae_f32x2 buffer_data; + ae_f32x2 fft_data; + ae_f32x2 d0, d1; + ae_f32x2 *w = (ae_f32x2 *)obuf->w_ptr; + ae_f32 *fft_p = (ae_f32 *)&fft->fft_buf[fft->fft_fill_start_idx]; + int samples_remain = fft->fft_size; + int i, n; + + while (samples_remain) { + n = stft_process_buffer_samples_without_wrap(obuf, (int32_t *)w); + + /* The samples count must be even and not zero, the latter to avoid infinite + * loop. The assert can trigger only with incorrect usage of this function. + */ + assert(n && !(n & 1)); + n = MIN(samples_remain, n) >> 1; + for (i = 0; i < n; i++) { + /* Load two FFT real part values, combine into fft_data */ + AE_L32_IP(d0, fft_p, sizeof(ae_f32x2)); + AE_L32_IP(d1, fft_p, sizeof(ae_f32x2)); + fft_data = AE_SEL32_HH(d0, d1); + + /* Load buffer data, multiply fft_data with gain and accumulate, and + * store to output buffer. 
+ */ + buffer_data = AE_L32X2_I(w, 0); + AE_MULAFP32X2RS(buffer_data, fft_data, gain); + AE_S32X2_IP(buffer_data, w, sizeof(ae_f32x2)); + } + w = (ae_f32x2 *)stft_process_buffer_wrap(obuf, (int32_t *)w); + samples_remain -= n << 1; + } + + obuf->w_ptr = stft_process_buffer_wrap(obuf, obuf->w_ptr + fft->fft_hop_size); + obuf->s_avail += fft->fft_hop_size; + obuf->s_free -= fft->fft_hop_size; +} + +#endif /* SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) */ diff --git a/src/audio/stft_process/stft_process.h b/src/audio/stft_process/stft_process.h index c6b4dd6c8893..1323d948a6c4 100644 --- a/src/audio/stft_process/stft_process.h +++ b/src/audio/stft_process/stft_process.h @@ -213,28 +213,128 @@ static inline int stft_process_get_config(struct processing_module *mod, uint32_ } #endif +/** + * stft_process_setup() - Initialize STFT processing state and allocate buffers. + * @mod: Pointer to processing module. + * @max_frames: Maximum number of frames per processing call. + * @rate: Audio sample rate in Hz. + * @channels: Number of audio channels. + * + * Configures FFT parameters, allocates aligned sample and FFT buffers, + * sets up window function, and creates FFT/IFFT plans based on the + * component configuration. + * + * Return: Zero on success, otherwise a negative error code. + */ int stft_process_setup(struct processing_module *mod, int max_frames, int rate, int channels); +/** + * stft_process_source_s16() - Copy S16_LE source data to STFT internal buffers. + * @cd: STFT component data. + * @source: Source for PCM samples data. + * @frames: Number of audio data frames to process. + * + * De-interleaves S16_LE audio frames from the source circular buffer + * into per-channel internal circular buffers. Each 16-bit sample is + * converted to Q1.31 format by left-shifting 16 bits. + * + * Return: Zero on success, otherwise an error code. 
+ */ int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames); +/** + * stft_process_sink_s16() - Copy STFT internal buffers to S16_LE sink. + * @cd: STFT component data. + * @sink: Sink for PCM samples data. + * @frames: Number of audio data frames to produce. + * + * Interleaves per-channel STFT output buffers into the sink circular + * buffer in S16_LE format. Q1.31 samples are converted to Q1.15 with + * rounding and saturation. Output buffer samples are cleared after + * reading to prepare for the next overlap-add cycle. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames); +/** + * stft_process_source_s32() - Copy S32_LE source data to STFT internal buffers. + * @cd: STFT component data. + * @source: Source for PCM samples data. + * @frames: Number of audio data frames to process. + * + * De-interleaves S32_LE audio frames from the source circular buffer + * into per-channel internal circular buffers. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames); +/** + * stft_process_sink_s32() - Copy STFT internal buffers to S32_LE sink. + * @cd: STFT component data. + * @sink: Sink for PCM samples data. + * @frames: Number of audio data frames to produce. + * + * Interleaves per-channel STFT output buffers into the sink circular + * buffer in S32_LE format. The output buffer samples are cleared after + * reading to prepare for the next overlap-add cycle. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames); +/** + * stft_process_free_buffers() - Free all STFT processing buffers. + * @mod: Pointer to processing module. + * + * Releases sample buffers, FFT buffers, and FFT/IFFT plans allocated + * during stft_process_setup(). 
+ */ void stft_process_free_buffers(struct processing_module *mod); -void stft_process_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); - +/** + * stft_process_fill_prev_samples() - Save overlap samples for next STFT frame. + * @buf: Circular buffer to read overlap samples from. + * @prev_data: Destination array for the overlap data. + * @prev_data_length: Number of samples to copy. + * + * Copies prev_data_length samples from the circular buffer into the + * linear prev_data array, handling wrap-around as needed. + */ void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, int prev_data_length); +/** + * stft_process_fill_fft_buffer() - Assemble FFT input from overlap and new data. + * @state: STFT processing state. + * @ch: Channel index. + * + * Constructs the FFT input buffer by concatenating the previous overlap + * samples and one hop of new samples from the input circular buffer. + * Imaginary parts are set to zero. The overlap buffer is updated with + * data for the next frame. + */ void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch); +/** + * stft_process_apply_window() - Multiply FFT buffer by the analysis window. + * @state: STFT processing state that contains the FFT buffer and window. + * + * The real part of each complex sample in the FFT buffer is multiplied + * by the corresponding Q1.31 window coefficient. + */ void stft_process_apply_window(struct stft_process_state *state); +/** + * stft_process_overlap_add_ifft_buffer() - Overlap-add IFFT output to circular output buffer. + * @state: STFT processing state. + * @ch: Channel index. + * + * Each IFFT output sample is multiplied by the gain compensation value + * and added with saturation to the existing content of the circular + * output buffer. 
+ */ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch); #endif // __SOF_AUDIO_STFT_PROCESS_H__ diff --git a/src/audio/stft_process/stft_process_common.c b/src/audio/stft_process/stft_process_common.c index 2fcaeb349b84..6ab3199082de 100644 --- a/src/audio/stft_process/stft_process_common.c +++ b/src/audio/stft_process/stft_process_common.c @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include #include #include #include @@ -17,6 +21,7 @@ #include #include #include +#include #if STFT_DEBUG extern FILE *stft_debug_fft_in_fh; @@ -36,6 +41,313 @@ static void debug_print_to_file_complex(FILE *fh, struct icomplex32 *c, int n) } #endif +#if CONFIG_FORMAT_S32LE +int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *ibuf; + int32_t const *x, *x_start, *x_end; + int x_size; + int bytes = frames * cd->frame_bytes; + int frames_left = frames; + int ret; + int n1; + int n2; + int channels = cd->channels; + int n; + int i; + int j; + + /* Get pointer to source data in circular buffer */ + ret = source_get_data_s32(source, bytes, &x, &x_start, &x_size); + if (ret) + return ret; + + /* Set helper pointers to buffer end for wrap check. Then loop until all + * samples are processed. + */ + x_end = x_start + x_size; + + while (frames_left) { + /* Find out samples to process before first wrap or end of data. 
*/ + ibuf = &state->ibuf[0]; + n1 = (x_end - x) / cd->channels; + n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); + n = MIN(n1, n2); + n = MIN(n, frames_left); + for (i = 0; i < n; i++) { + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + *ibuf->w_ptr++ = *x++; + } + } + + /* One of the buffers needs a wrap (or end of data), so check for wrap */ + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); + } + + if (x >= x_end) + x -= x_size; + + /* Update processed samples count for next loop iteration. */ + frames_left -= n; + } + + /* Update the source for bytes consumed. Return success. */ + source_release_data(source, bytes); + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->s_avail += frames; + ibuf->s_free -= frames; + } + + return 0; +} + +int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *obuf; + int32_t *y, *y_start, *y_end; + int frames_remain = frames; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int y_size; + int ret; + int ch, n1, n, i; + + /* Get pointer to sink data in circular buffer */ + ret = sink_get_buffer_s32(sink, bytes, &y, &y_start, &y_size); + if (ret) + return ret; + + /* Set helper pointers to buffer end for wrap check. Then loop until all + * samples are processed. + */ + y_end = y_start + y_size; + while (frames_remain) { + /* Find out samples to process before first wrap or end of data. 
*/ + obuf = &state->obuf[0]; + n1 = (y_end - y) / cd->channels; + n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); + n = MIN(n1, n); + n = MIN(n, frames_remain); + + for (i = 0; i < n; i++) { + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + *y++ = *obuf->r_ptr; + *obuf->r_ptr++ = 0; /* clear overlap add mix */ + } + } + + /* One of the buffers needs a wrap (or end of data), so check for wrap */ + for (ch = 0; ch < cd->channels; ch++) { + obuf = &state->obuf[ch]; + obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); + } + + if (y >= y_end) + y -= y_size; + + /* Update processed samples count for next loop iteration. */ + frames_remain -= n; + } + + /* Update the sink for bytes produced. Return success. */ + sink_commit_buffer(sink, bytes); + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->s_avail -= frames; + obuf->s_free += frames; + } + + return 0; +} +#endif /* CONFIG_FORMAT_S32LE */ + +#if CONFIG_FORMAT_S16LE +int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *ibuf; + int16_t const *x, *x_start, *x_end; + int16_t in; + int x_size; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int frames_left = frames; + int ret; + int n1; + int n2; + int n; + int i; + int j; + + ret = source_get_data_s16(source, bytes, &x, &x_start, &x_size); + if (ret) + return ret; + + x_end = x_start + x_size; + + while (frames_left) { + ibuf = &state->ibuf[0]; + n1 = (x_end - x) / cd->channels; + n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); + n = MIN(n1, n2); + n = MIN(n, frames_left); + for (i = 0; i < n; i++) { + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + in = *x++; + *ibuf->w_ptr++ = (int32_t)in << 16; + } + } + + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); + } + 
+ if (x >= x_end) + x -= x_size; + + frames_left -= n; + } + + source_release_data(source, bytes); + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->s_avail += frames; + ibuf->s_free -= frames; + } + return 0; +} + +int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *obuf; + int16_t *y, *y_start, *y_end; + int frames_remain = frames; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int y_size; + int ret; + int ch, n1, n, i; + + ret = sink_get_buffer_s16(sink, bytes, &y, &y_start, &y_size); + if (ret) + return ret; + + y_end = y_start + y_size; + while (frames_remain) { + obuf = &state->obuf[0]; + n1 = (y_end - y) / cd->channels; + n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); + n = MIN(n1, n); + n = MIN(n, frames_remain); + + for (i = 0; i < n; i++) { + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + *y++ = sat_int16(Q_SHIFT_RND(*obuf->r_ptr, 31, 15)); + *obuf->r_ptr++ = 0; /* clear overlap add mix */ + } + } + + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); + } + + if (y >= y_end) + y -= y_size; + + frames_remain -= n; + } + + sink_commit_buffer(sink, bytes); + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->s_avail -= frames; + obuf->s_free += frames; + } + + return 0; +} +#endif /* CONFIG_FORMAT_S16LE */ + +void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, + int prev_data_length) +{ + int32_t *r = buf->r_ptr; + int32_t *p = prev_data; + int copied; + int nmax; + int n; + + for (copied = 0; copied < prev_data_length; copied += n) { + nmax = prev_data_length - copied; + n = stft_process_buffer_samples_without_wrap(buf, r); + n = MIN(n, nmax); + memcpy(p, r, sizeof(int32_t) * n); + p += n; + r += n; + r = stft_process_buffer_wrap(buf, 
r); + } + + buf->s_avail -= copied; + buf->s_free += copied; + buf->r_ptr = r; +} + +void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch) +{ + struct stft_process_buffer *ibuf = &state->ibuf[ch]; + struct stft_process_fft *fft = &state->fft; + int32_t *prev_data = state->prev_data[ch]; + int32_t *r = ibuf->r_ptr; + int copied; + int nmax; + int idx; + int j; + int n; + + /* Copy overlapped samples from state buffer. Imaginary part of input + * remains zero. + */ + for (j = 0; j < state->prev_data_size; j++) { + fft->fft_buf[j].real = prev_data[j]; + fft->fft_buf[j].imag = 0; + } + + /* Copy hop size of new data from circular buffer */ + idx = state->prev_data_size; + for (copied = 0; copied < fft->fft_hop_size; copied += n) { + nmax = fft->fft_hop_size - copied; + n = stft_process_buffer_samples_without_wrap(ibuf, r); + n = MIN(n, nmax); + for (j = 0; j < n; j++) { + fft->fft_buf[idx].real = *r++; + fft->fft_buf[idx].imag = 0; + idx++; + } + r = stft_process_buffer_wrap(ibuf, r); + } + + ibuf->s_avail -= copied; + ibuf->s_free += copied; + ibuf->r_ptr = r; + + /* Copy for next time data back to overlap buffer */ + idx = fft->fft_hop_size; + for (j = 0; j < state->prev_data_size; j++) + prev_data[j] = fft->fft_buf[idx + j].real; +} + LOG_MODULE_REGISTER(stft_process_common, CONFIG_SOF_LOG_LEVEL); /* diff --git a/src/audio/stft_process/stft_process_setup.c b/src/audio/stft_process/stft_process_setup.c index 1e5b9c534205..dc854e77bca1 100644 --- a/src/audio/stft_process/stft_process_setup.c +++ b/src/audio/stft_process/stft_process_setup.c @@ -90,6 +90,8 @@ int stft_process_setup(struct processing_module *mod, int max_frames, return -EINVAL; } + /* max_frames needs to be even for buffer size allocation for Xtensa HiFi SIMD. 
*/ + max_frames = ALIGN_UP(max_frames, 2); cd->max_frames = max_frames; state->sample_rate = sample_rate; @@ -110,25 +112,37 @@ int stft_process_setup(struct processing_module *mod, int max_frames, fft->fft_hop_size = config->frame_shift; fft->half_fft_size = (fft->fft_padded_size >> 1) + 1; + /* FFT size needs to be a multiple of 4 for Xtensa HiFi SIMD, + * and FFT hop size needs to be a multiple of 2. Check also + * for otherwise sane values. + */ + if (fft->fft_size <= 0 || fft->fft_hop_size <= 0 || + fft->fft_hop_size > fft->fft_size || + (fft->fft_size & 3) || (fft->fft_hop_size & 1)) { + comp_err(dev, "FFT size %d or hop size %d are invalid.", + fft->fft_size, fft->fft_hop_size); + return -EINVAL; + } + comp_info(dev, "fft_size = %d, fft_hop_size = %d, window = %d", fft->fft_size, fft->fft_hop_size, config->window); /* Calculated parameters */ state->prev_data_size = fft->fft_size - fft->fft_hop_size; - ibuf_size = fft->fft_hop_size + cd->max_frames; - obuf_size = fft->fft_size + cd->max_frames; + ibuf_size = fft->fft_hop_size + max_frames; + obuf_size = fft->fft_size + max_frames; prev_size = state->prev_data_size; /* Allocate buffer input samples, overlap buffer, window */ - sample_buffers_size = sizeof(int32_t) * cd->channels * - (ibuf_size + obuf_size + prev_size + fft->fft_size); + sample_buffers_size = sizeof(int32_t) * + (cd->channels * (ibuf_size + obuf_size + prev_size) + fft->fft_size); - if (sample_buffers_size > STFT_MAX_ALLOC_SIZE || sample_buffers_size < 0) { + if (sample_buffers_size > STFT_MAX_ALLOC_SIZE) { comp_err(dev, "Illegal allocation size"); - return -EINVAL; + return -ENOMEM; } - state->buffers = mod_balloc(mod, sample_buffers_size); + state->buffers = mod_balloc_align(mod, sample_buffers_size, 2 * sizeof(int32_t)); if (!state->buffers) { comp_err(dev, "Failed buffer allocate"); ret = -ENOMEM; @@ -149,14 +163,14 @@ int stft_process_setup(struct processing_module *mod, int max_frames, /* Allocate buffers for FFT input and output data */ 
fft->fft_buffer_size = fft->fft_padded_size * sizeof(struct icomplex32); - fft->fft_buf = mod_balloc(mod, fft->fft_buffer_size); + fft->fft_buf = mod_balloc_align(mod, fft->fft_buffer_size, sizeof(struct icomplex32)); if (!fft->fft_buf) { comp_err(dev, "Failed FFT buffer allocate"); ret = -ENOMEM; goto free_buffers; } - fft->fft_out = mod_balloc(mod, fft->fft_buffer_size); + fft->fft_out = mod_balloc_align(mod, fft->fft_buffer_size, sizeof(struct icomplex32)); if (!fft->fft_out) { comp_err(dev, "Failed FFT output allocate"); ret = -ENOMEM; From f51925c7d11f45a8c08b8218e88deae33a03f814 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 25 Mar 2026 19:24:09 +0200 Subject: [PATCH 2/2] Audio: STFT Process: Remove unused fill_start_idx This patch removes the fft_fill_start_idx member from struct stft_process_fft. Keeping it would have required an additional check of data alignment and sample count for the Xtensa HiFi SIMD code version. This component has no need for different FFT padding types (left, center, right, as in MFCC), so the member is safe to remove. 
Signed-off-by: Seppo Ingalsuo --- src/audio/stft_process/stft_process-generic.c | 7 +++---- src/audio/stft_process/stft_process-hifi3.c | 5 ++--- src/audio/stft_process/stft_process.h | 1 - src/audio/stft_process/stft_process_setup.c | 2 -- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/audio/stft_process/stft_process-generic.c b/src/audio/stft_process/stft_process-generic.c index 6ade95bb6ae8..5241c372261f 100644 --- a/src/audio/stft_process/stft_process-generic.c +++ b/src/audio/stft_process/stft_process-generic.c @@ -18,7 +18,7 @@ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int int i; int n; int samples_remain = fft->fft_size; - int idx = fft->fft_fill_start_idx; + int idx = 0; while (samples_remain) { n = stft_process_buffer_samples_without_wrap(obuf, w); @@ -49,12 +49,11 @@ void stft_process_apply_window(struct stft_process_state *state) { struct stft_process_fft *fft = &state->fft; int j; - int i = fft->fft_fill_start_idx; /* Multiply Q1.31 by Q1.15 gives Q2.46, shift right by 15 to get Q2.31, no saturate need */ for (j = 0; j < fft->fft_size; j++) - fft->fft_buf[i + j].real = - sat_int32(Q_MULTSR_32X32((int64_t)fft->fft_buf[i + j].real, + fft->fft_buf[j].real = + sat_int32(Q_MULTSR_32X32((int64_t)fft->fft_buf[j].real, state->window[j], 31, 31, 31)); } #endif /* SOF_USE_HIFI(NONE, COMP_STFT_PROCESS) */ diff --git a/src/audio/stft_process/stft_process-hifi3.c b/src/audio/stft_process/stft_process-hifi3.c index ae7f4f9b69dd..6cf7c3dc7e85 100644 --- a/src/audio/stft_process/stft_process-hifi3.c +++ b/src/audio/stft_process/stft_process-hifi3.c @@ -37,7 +37,6 @@ void stft_process_apply_window(struct stft_process_state *state) ae_f32x2 win01, win23; ae_int32x2 d0, d1; int fft_size = fft->fft_size; - int i = fft->fft_fill_start_idx; int j; int n4; @@ -52,7 +51,7 @@ void stft_process_apply_window(struct stft_process_state *state) * Stride for buf is sizeof(ae_int32x2) = 8 bytes per complex sample. 
* Stride for win is sizeof(ae_int32) = 4 bytes per scalar window value. */ - buf = (ae_int32 *)&fft->fft_buf[i]; + buf = (ae_int32 *)fft->fft_buf; win = (const ae_int32x2 *)state->window; assert(!(fft_size & 3)); @@ -109,7 +108,7 @@ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ae_f32x2 fft_data; ae_f32x2 d0, d1; ae_f32x2 *w = (ae_f32x2 *)obuf->w_ptr; - ae_f32 *fft_p = (ae_f32 *)&fft->fft_buf[fft->fft_fill_start_idx]; + ae_f32 *fft_p = (ae_f32 *)fft->fft_buf; int samples_remain = fft->fft_size; int i, n; diff --git a/src/audio/stft_process/stft_process.h b/src/audio/stft_process/stft_process.h index 1323d948a6c4..558023ba55bc 100644 --- a/src/audio/stft_process/stft_process.h +++ b/src/audio/stft_process/stft_process.h @@ -66,7 +66,6 @@ struct stft_process_fft { struct ipolar32 *fft_polar; struct fft_multi_plan *fft_plan; struct fft_multi_plan *ifft_plan; - int fft_fill_start_idx; /**< Set to 0 for pad left, etc. */ int fft_size; int fft_padded_size; int fft_hop_size; diff --git a/src/audio/stft_process/stft_process_setup.c b/src/audio/stft_process/stft_process_setup.c index dc854e77bca1..377d5feeabc8 100644 --- a/src/audio/stft_process/stft_process_setup.c +++ b/src/audio/stft_process/stft_process_setup.c @@ -180,8 +180,6 @@ int stft_process_setup(struct processing_module *mod, int max_frames, /* Share the fft_out buffer for polar format */ fft->fft_polar = (struct ipolar32 *)fft->fft_out; - fft->fft_fill_start_idx = 0; /* From config pad_type */ - /* Setup FFT */ fft->fft_plan = mod_fft_multi_plan_new(mod, fft->fft_buf, fft->fft_out, fft->fft_padded_size, 32);