diff --git a/src/audio/stft_process/CMakeLists.txt b/src/audio/stft_process/CMakeLists.txt index 66ccb2276c58..e3a13e17bb93 100644 --- a/src/audio/stft_process/CMakeLists.txt +++ b/src/audio/stft_process/CMakeLists.txt @@ -8,6 +8,7 @@ else() add_local_sources(sof stft_process_setup.c) add_local_sources(sof stft_process_common.c) add_local_sources(sof stft_process-generic.c) + add_local_sources(sof stft_process-hifi3.c) if(CONFIG_IPC_MAJOR_4) add_local_sources(sof stft_process-ipc4.c) diff --git a/src/audio/stft_process/Kconfig b/src/audio/stft_process/Kconfig index b73bdebe3bb4..426d272477a9 100644 --- a/src/audio/stft_process/Kconfig +++ b/src/audio/stft_process/Kconfig @@ -19,6 +19,8 @@ config COMP_STFT_PROCESS if COMP_STFT_PROCESS +rsource "Kconfig.simd" + config STFT_PROCESS_MAGNITUDE_PHASE bool "Convert FFTs to polar magnitude and phase" default n diff --git a/src/audio/stft_process/Kconfig.simd b/src/audio/stft_process/Kconfig.simd new file mode 100644 index 000000000000..0b981df583fa --- /dev/null +++ b/src/audio/stft_process/Kconfig.simd @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause + +comment "STFT Process optimization level select" + +choice "COMP_STFT_PROCESS_SIMD_LEVEL_SELECT" + prompt "Choose which SIMD level is used for the STFT Process module" + depends on COMP_STFT_PROCESS + default COMP_STFT_PROCESS_HIFI_MAX + + config COMP_STFT_PROCESS_HIFI_MAX + prompt "SIMD will be selected by toolchain pre-defined header" + bool + help + When this is selected, the optimization level will be + determined by the toolchain pre-defined macros in the + core isa header file. + + config COMP_STFT_PROCESS_HIFI_3 + prompt "Choose HIFI3 intrinsic optimized STFT Process module" + bool + help + This option is used to build HIFI3 intrinsic optimized + STFT Process code. + + config COMP_STFT_PROCESS_HIFI_NONE + prompt "Choose generic C STFT Process module, no HIFI SIMD involved" + bool + help + This option is used to build STFT Process + with generic C code. 
+endchoice diff --git a/src/audio/stft_process/stft_process-generic.c b/src/audio/stft_process/stft_process-generic.c index 3399c24657a4..5241c372261f 100644 --- a/src/audio/stft_process/stft_process-generic.c +++ b/src/audio/stft_process/stft_process-generic.c @@ -1,396 +1,14 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2025 Intel Corporation. +// Copyright(c) 2025-2026 Intel Corporation. -#include -#include #include -#include -#include -#include +#include +#include #include #include "stft_process.h" -#if CONFIG_FORMAT_S32LE -/** - * stft_process_source_s32() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *ibuf; - int32_t const *x, *x_start, *x_end; - int x_size; - int bytes = frames * cd->frame_bytes; - int frames_left = frames; - int ret; - int n1; - int n2; - int channels = cd->channels; - int n; - int i; - int j; - - /* Get pointer to source data in circular buffer */ - ret = source_get_data_s32(source, bytes, &x, &x_start, &x_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - x_end = x_start + x_size; - - while (frames_left) { - /* Find out samples to process before first wrap or end of data. 
*/ - ibuf = &state->ibuf[0]; - n1 = (x_end - x) / cd->channels; - n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); - n = MIN(n1, n2); - n = MIN(n, frames_left); - for (i = 0; i < n; i++) { - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - *ibuf->w_ptr++ = *x++; - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); - } - - if (x >= x_end) - x -= x_size; - - /* Update processed samples count for next loop iteration. */ - frames_left -= n; - } - - /* Update the source for bytes consumed. Return success. */ - source_release_data(source, bytes); - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->s_avail += frames; - ibuf->s_free -= frames; - } - - return 0; -} - -/** - * stft_process_sink_s32() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *obuf; - int32_t *y, *y_start, *y_end; - int frames_remain = frames; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int y_size; - int ret; - int ch, n1, n, i; - - /* Get pointer to sink data in circular buffer */ - ret = sink_get_buffer_s32(sink, bytes, &y, &y_start, &y_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. 
- */ - y_end = y_start + y_size; - while (frames_remain) { - /* Find out samples to process before first wrap or end of data. */ - obuf = &state->obuf[0]; - n1 = (y_end - y) / cd->channels; - n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); - n = MIN(n1, n); - n = MIN(n, frames_remain); - - for (i = 0; i < n; i++) { - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - *y++ = *obuf->r_ptr; - *obuf->r_ptr++ = 0; /* clear overlap add mix */ - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (ch = 0; ch < cd->channels; ch++) { - obuf = &state->obuf[ch]; - obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); - } - - if (y >= y_end) - y -= y_size; - - /* Update processed samples count for next loop iteration. */ - frames_remain -= n; - } - - /* Update the sink for bytes produced. Return success. */ - sink_commit_buffer(sink, bytes); - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->s_avail -= frames; - obuf->s_free += frames; - } - - return 0; -} -#endif /* CONFIG_FORMAT_S32LE */ - -#if CONFIG_FORMAT_S16LE -/** - * stft_process_source_s16() - Process S16_LE format. - * @mod: Pointer to module data. - * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. 
- */ -int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *ibuf; - int16_t const *x, *x_start, *x_end; - int16_t in; - int x_size; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int frames_left = frames; - int ret; - int n1; - int n2; - int n; - int i; - int j; - - /* Get pointer to source data in circular buffer, get buffer start and size to - * check for wrap. The size in bytes is converted to number of s16 samples to - * control the samples process loop. If the number of bytes requested is not - * possible, an error is returned. - */ - ret = source_get_data_s16(source, bytes, &x, &x_start, &x_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - x_end = x_start + x_size; - - while (frames_left) { - /* Find out samples to process before first wrap or end of data. */ - ibuf = &state->ibuf[0]; - n1 = (x_end - x) / cd->channels; - n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); - n = MIN(n1, n2); - n = MIN(n, frames_left); - for (i = 0; i < n; i++) { - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - in = *x++; - *ibuf->w_ptr++ = (int32_t)in << 16; - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); - } - - if (x >= x_end) - x -= x_size; - - /* Update processed samples count for next loop iteration. */ - frames_left -= n; - } - - /* Update the source for bytes consumed. Return success. */ - source_release_data(source, bytes); - for (j = 0; j < channels; j++) { - ibuf = &state->ibuf[j]; - ibuf->s_avail += frames; - ibuf->s_free -= frames; - } - return 0; -} - -/** - * stft_process_sink_s16() - Process S16_LE format. - * @mod: Pointer to module data. 
- * @source: Source for PCM samples data. - * @sink: Sink for PCM samples data. - * @frames: Number of audio data frames to process. - * - * This is the processing function for 16-bit signed integer PCM formats. The - * audio samples in every frame are re-order to channels order defined in - * component data channel_map[]. - * - * Return: Value zero for success, otherwise an error code. - */ -int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames) -{ - struct stft_process_state *state = &cd->state; - struct stft_process_buffer *obuf; - int16_t *y, *y_start, *y_end; - int frames_remain = frames; - int channels = cd->channels; - int bytes = frames * cd->frame_bytes; - int y_size; - int ret; - int ch, n1, n, i; - - /* Get pointer to sink data in circular buffer */ - ret = sink_get_buffer_s16(sink, bytes, &y, &y_start, &y_size); - if (ret) - return ret; - - /* Set helper pointers to buffer end for wrap check. Then loop until all - * samples are processed. - */ - y_end = y_start + y_size; - while (frames_remain) { - /* Find out samples to process before first wrap or end of data. */ - obuf = &state->obuf[0]; - n1 = (y_end - y) / cd->channels; - n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); - n = MIN(n1, n); - n = MIN(n, frames_remain); - - for (i = 0; i < n; i++) { - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - *y++ = sat_int16(Q_SHIFT_RND(*obuf->r_ptr, 31, 15)); - *obuf->r_ptr++ = 0; /* clear overlap add mix */ - } - } - - /* One of the buffers needs a wrap (or end of data), so check for wrap */ - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); - } - - if (y >= y_end) - y -= y_size; - - /* Update processed samples count for next loop iteration. */ - frames_remain -= n; - } - - /* Update the sink for bytes produced. Return success. 
*/ - sink_commit_buffer(sink, bytes); - for (ch = 0; ch < channels; ch++) { - obuf = &state->obuf[ch]; - obuf->s_avail -= frames; - obuf->s_free += frames; - } - - return 0; -} -#endif /* CONFIG_FORMAT_S16LE */ - -void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, - int prev_data_length) -{ - /* Fill prev_data from input buffer */ - int32_t *r = buf->r_ptr; - int32_t *p = prev_data; - int copied; - int nmax; - int n; - - for (copied = 0; copied < prev_data_length; copied += n) { - nmax = prev_data_length - copied; - n = stft_process_buffer_samples_without_wrap(buf, r); - n = MIN(n, nmax); - memcpy(p, r, sizeof(int32_t) * n); /* Not using memcpy_s() due to speed need */ - p += n; - r += n; - r = stft_process_buffer_wrap(buf, r); - } - - buf->s_avail -= copied; - buf->s_free += copied; - buf->r_ptr = r; -} - -void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch) -{ - struct stft_process_buffer *ibuf = &state->ibuf[ch]; - struct stft_process_fft *fft = &state->fft; - int32_t *prev_data = state->prev_data[ch]; - int32_t *r = ibuf->r_ptr; - int copied; - int nmax; - int idx; - int j; - int n; - - /* Copy overlapped samples from state buffer. Imaginary part of input - * remains zero. 
- */ - for (j = 0; j < state->prev_data_size; j++) { - fft->fft_buf[j].real = prev_data[j]; - fft->fft_buf[j].imag = 0; - } - - /* Copy hop size of new data from circular buffer */ - idx = state->prev_data_size; - for (copied = 0; copied < fft->fft_hop_size; copied += n) { - nmax = fft->fft_hop_size - copied; - n = stft_process_buffer_samples_without_wrap(ibuf, r); - n = MIN(n, nmax); - for (j = 0; j < n; j++) { - fft->fft_buf[idx].real = *r++; - fft->fft_buf[idx].imag = 0; - idx++; - } - r = stft_process_buffer_wrap(ibuf, r); - } - - ibuf->s_avail -= copied; - ibuf->s_free += copied; - ibuf->r_ptr = r; - - /* Copy for next time data back to overlap buffer */ - idx = fft->fft_hop_size; - for (j = 0; j < state->prev_data_size; j++) - prev_data[j] = fft->fft_buf[idx + j].real; -} - +#if SOF_USE_HIFI(NONE, COMP_STFT_PROCESS) void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch) { struct stft_process_buffer *obuf = &state->obuf[ch]; @@ -400,11 +18,16 @@ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int int i; int n; int samples_remain = fft->fft_size; - int idx = fft->fft_fill_start_idx; + int idx = 0; while (samples_remain) { n = stft_process_buffer_samples_without_wrap(obuf, w); n = MIN(samples_remain, n); + + /* Abort if n is zero to avoid infinite loop. The assert can + * trigger only with incorrect usage of this function. 
+ */ + assert(n); for (i = 0; i < n; i++) { sample = Q_MULTSR_32X32((int64_t)state->gain_comp, fft->fft_buf[idx].real, 31, 31, 31); @@ -426,11 +49,11 @@ void stft_process_apply_window(struct stft_process_state *state) { struct stft_process_fft *fft = &state->fft; int j; - int i = fft->fft_fill_start_idx; /* Multiply Q1.31 by Q1.15 gives Q2.46, shift right by 15 to get Q2.31, no saturate need */ for (j = 0; j < fft->fft_size; j++) - fft->fft_buf[i + j].real = - sat_int32(Q_MULTSR_32X32((int64_t)fft->fft_buf[i + j].real, + fft->fft_buf[j].real = + sat_int32(Q_MULTSR_32X32((int64_t)fft->fft_buf[j].real, state->window[j], 31, 31, 31)); } +#endif /* SOF_USE_HIFI(NONE, COMP_STFT_PROCESS) */ diff --git a/src/audio/stft_process/stft_process-hifi3.c b/src/audio/stft_process/stft_process-hifi3.c new file mode 100644 index 000000000000..6cf7c3dc7e85 --- /dev/null +++ b/src/audio/stft_process/stft_process-hifi3.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2025-2026 Intel Corporation. + +/** + * \file + * \brief HiFi3 SIMD-optimized helpers for the STFT processing component. + * + * This compilation unit provides HiFi3 intrinsic versions of selected + * hot-path helpers. It is guarded by SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) + * so only one of the generic or hifi implementations is active. + */ + +#include +#include +#include +#include +#include "stft_process.h" + +#if SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) + +#include + +/** + * stft_process_apply_window() - Multiply FFT buffer by the analysis window. + * @state: STFT processing state that contains the FFT buffer and window. + * + * The real part of each icomplex32 sample in the FFT buffer is multiplied + * by the corresponding Q1.31 window coefficient. 
+ */ +void stft_process_apply_window(struct stft_process_state *state) +{ + struct stft_process_fft *fft = &state->fft; + ae_int32 *buf; + const ae_int32x2 *win; + ae_f32x2 data01, data23; + ae_f32x2 win01, win23; + ae_int32x2 d0, d1; + int fft_size = fft->fft_size; + int j; + int n4; + + /* + * buf points to {real, imag} pairs (struct icomplex32). + * win points to scalar Q1.31 window coefficients. + * + * We load only the real part of each complex pair, multiply it by + * the window value, then store the updated real part back. + * The imaginary part is left untouched. + * + * Stride for buf is sizeof(ae_int32x2) = 8 bytes per complex sample. + * Stride for win is sizeof(ae_int32) = 4 bytes per scalar window value. + */ + buf = (ae_int32 *)fft->fft_buf; + win = (const ae_int32x2 *)state->window; + + assert(!(fft_size & 3)); + + /* Main loop: process 4 samples per iteration */ + n4 = fft_size >> 2; + for (j = 0; j < n4; j++) { + /* Load four FFT real part values, combine into data01 and data23, + * buf[0] goes to data01 high, buf[1] goes to data01 low. 
+ */ + d0 = AE_L32_I(buf, 0 * sizeof(ae_int32x2)); + d1 = AE_L32_I(buf, 1 * sizeof(ae_int32x2)); + data01 = AE_SEL32_HH(d0, d1); + d0 = AE_L32_I(buf, 2 * sizeof(ae_int32x2)); + d1 = AE_L32_I(buf, 3 * sizeof(ae_int32x2)); + data23 = AE_SEL32_HH(d0, d1); + + /* Load four window coefficients, + * win[0] goes to win01 high, win[1] goes to win01 low + */ + AE_L32X2_IP(win01, win, sizeof(ae_int32x2)); + AE_L32X2_IP(win23, win, sizeof(ae_int32x2)); + + /* Multiply with window function */ + data01 = AE_MULFP32X2RS(data01, win01); + data23 = AE_MULFP32X2RS(data23, win23); + + /* Store back the updated real parts */ + AE_S32_L_IP(AE_SEL32_LH(data01, data01), buf, sizeof(ae_int32x2)); + AE_S32_L_IP(data01, buf, sizeof(ae_int32x2)); + AE_S32_L_IP(AE_SEL32_LH(data23, data23), buf, sizeof(ae_int32x2)); + AE_S32_L_IP(data23, buf, sizeof(ae_int32x2)); + } +} + +/** + * stft_process_overlap_add_ifft_buffer() - Overlap-add IFFT output to circular output buffer. + * @state: STFT processing state. + * @ch: Channel index. + * + * Each IFFT output sample is multiplied by gain_comp (Q1.31 x Q1.31) and + * added with saturation to the existing content of the circular output + * buffer. HiFi3 AE_MULAFP32X2RS performs the multiply and + * accumulate for two samples at a time. + * + * Note: obuf must hold an even number of samples and be 64-bit aligned. + */ +void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch) +{ + struct stft_process_buffer *obuf = &state->obuf[ch]; + struct stft_process_fft *fft = &state->fft; + ae_f32x2 gain = AE_MOVDA32(state->gain_comp); + ae_f32x2 buffer_data; + ae_f32x2 fft_data; + ae_f32x2 d0, d1; + ae_f32x2 *w = (ae_f32x2 *)obuf->w_ptr; + ae_f32 *fft_p = (ae_f32 *)fft->fft_buf; + int samples_remain = fft->fft_size; + int i, n; + + while (samples_remain) { + n = stft_process_buffer_samples_without_wrap(obuf, (int32_t *)w); + + /* The samples count must be even and not zero, the latter to avoid infinite + * loop. 
The assert can trigger only with incorrect usage of this function. + */ + assert(n && !(n & 1)); + n = MIN(samples_remain, n) >> 1; + for (i = 0; i < n; i++) { + /* Load two FFT real part values, combine into fft_data */ + AE_L32_IP(d0, fft_p, sizeof(ae_f32x2)); + AE_L32_IP(d1, fft_p, sizeof(ae_f32x2)); + fft_data = AE_SEL32_HH(d0, d1); + + /* Load buffer data, multiply fft_data with gain and accumulate, and + * store to output buffer. + */ + buffer_data = AE_L32X2_I(w, 0); + AE_MULAFP32X2RS(buffer_data, fft_data, gain); + AE_S32X2_IP(buffer_data, w, sizeof(ae_f32x2)); + } + w = (ae_f32x2 *)stft_process_buffer_wrap(obuf, (int32_t *)w); + samples_remain -= n << 1; + } + + obuf->w_ptr = stft_process_buffer_wrap(obuf, obuf->w_ptr + fft->fft_hop_size); + obuf->s_avail += fft->fft_hop_size; + obuf->s_free -= fft->fft_hop_size; +} + +#endif /* SOF_USE_MIN_HIFI(3, COMP_STFT_PROCESS) */ diff --git a/src/audio/stft_process/stft_process.h b/src/audio/stft_process/stft_process.h index c6b4dd6c8893..558023ba55bc 100644 --- a/src/audio/stft_process/stft_process.h +++ b/src/audio/stft_process/stft_process.h @@ -66,7 +66,6 @@ struct stft_process_fft { struct ipolar32 *fft_polar; struct fft_multi_plan *fft_plan; struct fft_multi_plan *ifft_plan; - int fft_fill_start_idx; /**< Set to 0 for pad left, etc. */ int fft_size; int fft_padded_size; int fft_hop_size; @@ -213,28 +212,128 @@ static inline int stft_process_get_config(struct processing_module *mod, uint32_ } #endif +/** + * stft_process_setup() - Initialize STFT processing state and allocate buffers. + * @mod: Pointer to processing module. + * @max_frames: Maximum number of frames per processing call. + * @rate: Audio sample rate in Hz. + * @channels: Number of audio channels. + * + * Configures FFT parameters, allocates aligned sample and FFT buffers, + * sets up window function, and creates FFT/IFFT plans based on the + * component configuration. + * + * Return: Zero on success, otherwise a negative error code. 
+ */ int stft_process_setup(struct processing_module *mod, int max_frames, int rate, int channels); +/** + * stft_process_source_s16() - Copy S16_LE source data to STFT internal buffers. + * @cd: STFT component data. + * @source: Source for PCM samples data. + * @frames: Number of audio data frames to process. + * + * De-interleaves S16_LE audio frames from the source circular buffer + * into per-channel internal circular buffers. Each 16-bit sample is + * converted to Q1.31 format by left-shifting 16 bits. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames); +/** + * stft_process_sink_s16() - Copy STFT internal buffers to S16_LE sink. + * @cd: STFT component data. + * @sink: Sink for PCM samples data. + * @frames: Number of audio data frames to produce. + * + * Interleaves per-channel STFT output buffers into the sink circular + * buffer in S16_LE format. Q1.31 samples are converted to Q1.15 with + * rounding and saturation. Output buffer samples are cleared after + * reading to prepare for the next overlap-add cycle. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames); +/** + * stft_process_source_s32() - Copy S32_LE source data to STFT internal buffers. + * @cd: STFT component data. + * @source: Source for PCM samples data. + * @frames: Number of audio data frames to process. + * + * De-interleaves S32_LE audio frames from the source circular buffer + * into per-channel internal circular buffers. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames); +/** + * stft_process_sink_s32() - Copy STFT internal buffers to S32_LE sink. + * @cd: STFT component data. + * @sink: Sink for PCM samples data. + * @frames: Number of audio data frames to produce. 
+ * + * Interleaves per-channel STFT output buffers into the sink circular + * buffer in S32_LE format. The output buffer samples are cleared after + * reading to prepare for the next overlap-add cycle. + * + * Return: Zero on success, otherwise an error code. + */ int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames); +/** + * stft_process_free_buffers() - Free all STFT processing buffers. + * @mod: Pointer to processing module. + * + * Releases sample buffers, FFT buffers, and FFT/IFFT plans allocated + * during stft_process_setup(). + */ void stft_process_free_buffers(struct processing_module *mod); -void stft_process_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); - +/** + * stft_process_fill_prev_samples() - Save overlap samples for next STFT frame. + * @buf: Circular buffer to read overlap samples from. + * @prev_data: Destination array for the overlap data. + * @prev_data_length: Number of samples to copy. + * + * Copies prev_data_length samples from the circular buffer into the + * linear prev_data array, handling wrap-around as needed. + */ void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, int prev_data_length); +/** + * stft_process_fill_fft_buffer() - Assemble FFT input from overlap and new data. + * @state: STFT processing state. + * @ch: Channel index. + * + * Constructs the FFT input buffer by concatenating the previous overlap + * samples and one hop of new samples from the input circular buffer. + * Imaginary parts are set to zero. The overlap buffer is updated with + * data for the next frame. + */ void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch); +/** + * stft_process_apply_window() - Multiply FFT buffer by the analysis window. + * @state: STFT processing state that contains the FFT buffer and window. 
+ * + * The real part of each complex sample in the FFT buffer is multiplied + * by the corresponding Q1.31 window coefficient. + */ void stft_process_apply_window(struct stft_process_state *state); +/** + * stft_process_overlap_add_ifft_buffer() - Overlap-add IFFT output to circular output buffer. + * @state: STFT processing state. + * @ch: Channel index. + * + * Each IFFT output sample is multiplied by the gain compensation value + * and added with saturation to the existing content of the circular + * output buffer. + */ void stft_process_overlap_add_ifft_buffer(struct stft_process_state *state, int ch); #endif // __SOF_AUDIO_STFT_PROCESS_H__ diff --git a/src/audio/stft_process/stft_process_common.c b/src/audio/stft_process/stft_process_common.c index 2fcaeb349b84..6ab3199082de 100644 --- a/src/audio/stft_process/stft_process_common.c +++ b/src/audio/stft_process/stft_process_common.c @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include #include #include #include @@ -17,6 +21,7 @@ #include #include #include +#include #if STFT_DEBUG extern FILE *stft_debug_fft_in_fh; @@ -36,6 +41,313 @@ static void debug_print_to_file_complex(FILE *fh, struct icomplex32 *c, int n) } #endif +#if CONFIG_FORMAT_S32LE +int stft_process_source_s32(struct stft_comp_data *cd, struct sof_source *source, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *ibuf; + int32_t const *x, *x_start, *x_end; + int x_size; + int bytes = frames * cd->frame_bytes; + int frames_left = frames; + int ret; + int n1; + int n2; + int channels = cd->channels; + int n; + int i; + int j; + + /* Get pointer to source data in circular buffer */ + ret = source_get_data_s32(source, bytes, &x, &x_start, &x_size); + if (ret) + return ret; + + /* Set helper pointers to buffer end for wrap check. Then loop until all + * samples are processed. 
+ */ + x_end = x_start + x_size; + + while (frames_left) { + /* Find out samples to process before first wrap or end of data. */ + ibuf = &state->ibuf[0]; + n1 = (x_end - x) / cd->channels; + n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); + n = MIN(n1, n2); + n = MIN(n, frames_left); + for (i = 0; i < n; i++) { + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + *ibuf->w_ptr++ = *x++; + } + } + + /* One of the buffers needs a wrap (or end of data), so check for wrap */ + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); + } + + if (x >= x_end) + x -= x_size; + + /* Update processed samples count for next loop iteration. */ + frames_left -= n; + } + + /* Update the source for bytes consumed. Return success. */ + source_release_data(source, bytes); + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->s_avail += frames; + ibuf->s_free -= frames; + } + + return 0; +} + +int stft_process_sink_s32(struct stft_comp_data *cd, struct sof_sink *sink, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *obuf; + int32_t *y, *y_start, *y_end; + int frames_remain = frames; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int y_size; + int ret; + int ch, n1, n, i; + + /* Get pointer to sink data in circular buffer */ + ret = sink_get_buffer_s32(sink, bytes, &y, &y_start, &y_size); + if (ret) + return ret; + + /* Set helper pointers to buffer end for wrap check. Then loop until all + * samples are processed. + */ + y_end = y_start + y_size; + while (frames_remain) { + /* Find out samples to process before first wrap or end of data. 
*/ + obuf = &state->obuf[0]; + n1 = (y_end - y) / cd->channels; + n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); + n = MIN(n1, n); + n = MIN(n, frames_remain); + + for (i = 0; i < n; i++) { + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + *y++ = *obuf->r_ptr; + *obuf->r_ptr++ = 0; /* clear overlap add mix */ + } + } + + /* One of the buffers needs a wrap (or end of data), so check for wrap */ + for (ch = 0; ch < cd->channels; ch++) { + obuf = &state->obuf[ch]; + obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); + } + + if (y >= y_end) + y -= y_size; + + /* Update processed samples count for next loop iteration. */ + frames_remain -= n; + } + + /* Update the sink for bytes produced. Return success. */ + sink_commit_buffer(sink, bytes); + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->s_avail -= frames; + obuf->s_free += frames; + } + + return 0; +} +#endif /* CONFIG_FORMAT_S32LE */ + +#if CONFIG_FORMAT_S16LE +int stft_process_source_s16(struct stft_comp_data *cd, struct sof_source *source, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *ibuf; + int16_t const *x, *x_start, *x_end; + int16_t in; + int x_size; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int frames_left = frames; + int ret; + int n1; + int n2; + int n; + int i; + int j; + + ret = source_get_data_s16(source, bytes, &x, &x_start, &x_size); + if (ret) + return ret; + + x_end = x_start + x_size; + + while (frames_left) { + ibuf = &state->ibuf[0]; + n1 = (x_end - x) / cd->channels; + n2 = stft_process_buffer_samples_without_wrap(ibuf, ibuf->w_ptr); + n = MIN(n1, n2); + n = MIN(n, frames_left); + for (i = 0; i < n; i++) { + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + in = *x++; + *ibuf->w_ptr++ = (int32_t)in << 16; + } + } + + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->w_ptr = stft_process_buffer_wrap(ibuf, ibuf->w_ptr); + } + 
+ if (x >= x_end) + x -= x_size; + + frames_left -= n; + } + + source_release_data(source, bytes); + for (j = 0; j < channels; j++) { + ibuf = &state->ibuf[j]; + ibuf->s_avail += frames; + ibuf->s_free -= frames; + } + return 0; +} + +int stft_process_sink_s16(struct stft_comp_data *cd, struct sof_sink *sink, int frames) +{ + struct stft_process_state *state = &cd->state; + struct stft_process_buffer *obuf; + int16_t *y, *y_start, *y_end; + int frames_remain = frames; + int channels = cd->channels; + int bytes = frames * cd->frame_bytes; + int y_size; + int ret; + int ch, n1, n, i; + + ret = sink_get_buffer_s16(sink, bytes, &y, &y_start, &y_size); + if (ret) + return ret; + + y_end = y_start + y_size; + while (frames_remain) { + obuf = &state->obuf[0]; + n1 = (y_end - y) / cd->channels; + n = stft_process_buffer_samples_without_wrap(obuf, obuf->r_ptr); + n = MIN(n1, n); + n = MIN(n, frames_remain); + + for (i = 0; i < n; i++) { + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + *y++ = sat_int16(Q_SHIFT_RND(*obuf->r_ptr, 31, 15)); + *obuf->r_ptr++ = 0; /* clear overlap add mix */ + } + } + + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->r_ptr = stft_process_buffer_wrap(obuf, obuf->r_ptr); + } + + if (y >= y_end) + y -= y_size; + + frames_remain -= n; + } + + sink_commit_buffer(sink, bytes); + for (ch = 0; ch < channels; ch++) { + obuf = &state->obuf[ch]; + obuf->s_avail -= frames; + obuf->s_free += frames; + } + + return 0; +} +#endif /* CONFIG_FORMAT_S16LE */ + +void stft_process_fill_prev_samples(struct stft_process_buffer *buf, int32_t *prev_data, + int prev_data_length) +{ + int32_t *r = buf->r_ptr; + int32_t *p = prev_data; + int copied; + int nmax; + int n; + + for (copied = 0; copied < prev_data_length; copied += n) { + nmax = prev_data_length - copied; + n = stft_process_buffer_samples_without_wrap(buf, r); + n = MIN(n, nmax); + memcpy(p, r, sizeof(int32_t) * n); + p += n; + r += n; + r = stft_process_buffer_wrap(buf, 
r); + } + + buf->s_avail -= copied; + buf->s_free += copied; + buf->r_ptr = r; +} + +void stft_process_fill_fft_buffer(struct stft_process_state *state, int ch) +{ + struct stft_process_buffer *ibuf = &state->ibuf[ch]; + struct stft_process_fft *fft = &state->fft; + int32_t *prev_data = state->prev_data[ch]; + int32_t *r = ibuf->r_ptr; + int copied; + int nmax; + int idx; + int j; + int n; + + /* Copy overlapped samples from state buffer. Imaginary part of input + * remains zero. + */ + for (j = 0; j < state->prev_data_size; j++) { + fft->fft_buf[j].real = prev_data[j]; + fft->fft_buf[j].imag = 0; + } + + /* Copy hop size of new data from circular buffer */ + idx = state->prev_data_size; + for (copied = 0; copied < fft->fft_hop_size; copied += n) { + nmax = fft->fft_hop_size - copied; + n = stft_process_buffer_samples_without_wrap(ibuf, r); + n = MIN(n, nmax); + for (j = 0; j < n; j++) { + fft->fft_buf[idx].real = *r++; + fft->fft_buf[idx].imag = 0; + idx++; + } + r = stft_process_buffer_wrap(ibuf, r); + } + + ibuf->s_avail -= copied; + ibuf->s_free += copied; + ibuf->r_ptr = r; + + /* Copy for next time data back to overlap buffer */ + idx = fft->fft_hop_size; + for (j = 0; j < state->prev_data_size; j++) + prev_data[j] = fft->fft_buf[idx + j].real; +} + LOG_MODULE_REGISTER(stft_process_common, CONFIG_SOF_LOG_LEVEL); /* diff --git a/src/audio/stft_process/stft_process_setup.c b/src/audio/stft_process/stft_process_setup.c index 1e5b9c534205..377d5feeabc8 100644 --- a/src/audio/stft_process/stft_process_setup.c +++ b/src/audio/stft_process/stft_process_setup.c @@ -90,6 +90,8 @@ int stft_process_setup(struct processing_module *mod, int max_frames, return -EINVAL; } + /* max_frames needs to be even for buffer size allocation for Xtensa HiFi SIMD. 
*/ + max_frames = ALIGN_UP(max_frames, 2); cd->max_frames = max_frames; state->sample_rate = sample_rate; @@ -110,25 +112,37 @@ int stft_process_setup(struct processing_module *mod, int max_frames, fft->fft_hop_size = config->frame_shift; fft->half_fft_size = (fft->fft_padded_size >> 1) + 1; + /* FFT size needs to be a multiple of 4 for Xtensa HiFi SIMD, + * and FFT hop size needs to be a multiple of 2. Check also + * for otherwise sane values. + */ + if (fft->fft_size <= 0 || fft->fft_hop_size <= 0 || + fft->fft_hop_size > fft->fft_size || + (fft->fft_size & 3) || (fft->fft_hop_size & 1)) { + comp_err(dev, "FFT size %d or hop size %d are invalid.", + fft->fft_size, fft->fft_hop_size); + return -EINVAL; + } + comp_info(dev, "fft_size = %d, fft_hop_size = %d, window = %d", fft->fft_size, fft->fft_hop_size, config->window); /* Calculated parameters */ state->prev_data_size = fft->fft_size - fft->fft_hop_size; - ibuf_size = fft->fft_hop_size + cd->max_frames; - obuf_size = fft->fft_size + cd->max_frames; + ibuf_size = fft->fft_hop_size + max_frames; + obuf_size = fft->fft_size + max_frames; prev_size = state->prev_data_size; /* Allocate buffer input samples, overlap buffer, window */ - sample_buffers_size = sizeof(int32_t) * cd->channels * - (ibuf_size + obuf_size + prev_size + fft->fft_size); + sample_buffers_size = sizeof(int32_t) * + (cd->channels * (ibuf_size + obuf_size + prev_size) + fft->fft_size); - if (sample_buffers_size > STFT_MAX_ALLOC_SIZE || sample_buffers_size < 0) { + if (sample_buffers_size > STFT_MAX_ALLOC_SIZE) { comp_err(dev, "Illegal allocation size"); - return -EINVAL; + return -ENOMEM; } - state->buffers = mod_balloc(mod, sample_buffers_size); + state->buffers = mod_balloc_align(mod, sample_buffers_size, 2 * sizeof(int32_t)); if (!state->buffers) { comp_err(dev, "Failed buffer allocate"); ret = -ENOMEM; @@ -149,14 +163,14 @@ int stft_process_setup(struct processing_module *mod, int max_frames, /* Allocate buffers for FFT input and output data */ 
fft->fft_buffer_size = fft->fft_padded_size * sizeof(struct icomplex32); - fft->fft_buf = mod_balloc(mod, fft->fft_buffer_size); + fft->fft_buf = mod_balloc_align(mod, fft->fft_buffer_size, sizeof(struct icomplex32)); if (!fft->fft_buf) { comp_err(dev, "Failed FFT buffer allocate"); ret = -ENOMEM; goto free_buffers; } - fft->fft_out = mod_balloc(mod, fft->fft_buffer_size); + fft->fft_out = mod_balloc_align(mod, fft->fft_buffer_size, sizeof(struct icomplex32)); if (!fft->fft_out) { comp_err(dev, "Failed FFT output allocate"); ret = -ENOMEM; @@ -166,8 +180,6 @@ int stft_process_setup(struct processing_module *mod, int max_frames, /* Share the fft_out buffer for polar format */ fft->fft_polar = (struct ipolar32 *)fft->fft_out; - fft->fft_fill_start_idx = 0; /* From config pad_type */ - /* Setup FFT */ fft->fft_plan = mod_fft_multi_plan_new(mod, fft->fft_buf, fft->fft_out, fft->fft_padded_size, 32);