add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1 @@
7dadbd644c0d7ba4733cc3726ec4cff6edf27b043725e1115861dec1609a3d28

View File

@@ -0,0 +1 @@
{"version": "1.0", "algorithm": "sha256", "created_at": "2025-08-22T07:02:16.273928+00:00", "files": [{"path": "CMakeLists.txt", "size": 1121, "hash": "b8db31748630321376c2a5998ea99d300a2d7f77d41b6a84d6f838ae18001632"}, {"path": "README.md", "size": 3606, "hash": "73f10b8cb40f463d4758f3c67e686105e8d6dea8f65a209219c7e2fac8c6cc81"}, {"path": "benchmark_esp32c5.md", "size": 6313, "hash": "af1c5eec587b0f7addac1e94c31a581a49f7121219826df53cbab973e6249786"}, {"path": "benchmark_esp32p4.md", "size": 5015, "hash": "b204be13020ff47f967c35dd1d60727013a83bbffbea2f0a1ca93874fe1e6226"}, {"path": "benchmark_esp32s3.md", "size": 5015, "hash": "9e9f5fa858453c853f83d63913dc974bd8c5fc7e5883400f7c1e50ffa6ef44a4"}, {"path": "dl_fft.h", "size": 4787, "hash": "f2cca68876cce36d24d5e0e1b9fe8bac3e09f65c5d065c5451b43713cada65c0"}, {"path": "dl_fft.hpp", "size": 9514, "hash": "abb72093b97832af83f5e0eb5289de0a02ead0325049d3192a39295bc166fd55"}, {"path": "dl_fft_f32.c", "size": 2231, "hash": "895ea2322a0740cf02bd6075453a247b5144aefceb8f31b6717248e6eeaf79ee"}, {"path": "dl_fft_s16.c", "size": 2737, "hash": "fb9168e0a6e2b798b88a8e06d5482610db9070a5b3c5e66f32ed8dcf514a0649"}, {"path": "dl_rfft.h", "size": 6523, "hash": "0219466a0cfc4ab218b22af3cf7954c975a0f4e05dd7fc947ab98c3bc3f76796"}, {"path": "dl_rfft_f32.c", "size": 3661, "hash": "1d87595dfe93be1e6ad63e2decd0b04c94f8414da8ccff96a4a9068a1128baaa"}, {"path": "dl_rfft_s16.c", "size": 3228, "hash": "7449d35f33c2acc30c4d4d09e9557c8a9e1ca9c7d5964047d248b35a8e756e63"}, {"path": "idf_component.yml", "size": 361, "hash": "2fa5bee9d2ede17e4724582ce74e654c2082416b060b7caf0f7d97df0eb0a596"}, {"path": "base/dl_fft2r_fc32_ansi.c", "size": 7479, "hash": "d41e48229ba05e2aeb07057ca56016525f8344cce00f2e38de4dd41305478590"}, {"path": "base/dl_fft2r_sc16_ansi.c", "size": 20141, "hash": "30e62e058832ce77cb1f94d587ceb99342824deca8c287fe931274d51d4906e4"}, {"path": "base/dl_fft4r_fc32_ansi.c", "size": 9025, "hash": 
"2cf9d2acbd80e4acd69601545baaef72a67924df1aacd7476530d01926ee3d7b"}, {"path": "base/dl_fft_base.c", "size": 1729, "hash": "25caaeecf1a7d6ba5387292fd448a0da0884615af1d95159f093ee93b701746f"}, {"path": "base/dl_fft_base.h", "size": 3270, "hash": "a28f0e60e4c5bdae7c3aa70e39920cef17ed6d8e3f425abc85922689b798d493"}, {"path": "base/dl_fft_dtype.h", "size": 427, "hash": "209a8da6f977565f2bbf80d8d8726afb1d3c9a6dbf1e48e342c1db6023066306"}, {"path": "base/isa/dl_fft_platform.h", "size": 1552, "hash": "f5d76981793e26cdbf549d25faaae589d8e1776519a84315da4d733cef190087"}, {"path": "base/isa/esp32/dl_fft2r_fc32_ae32.S", "size": 7019, "hash": "2aa2a8f98f05076b485c9e45056d9dc49fc17a200f1bdba609b01b1443a778df"}, {"path": "base/isa/esp32/dl_fft4r_fc32_ae32.S", "size": 12062, "hash": "e64fa52065bde43eb844b89c107ed3d33a930511c6b43e90e52de9853a1a32e3"}, {"path": "base/isa/esp32p4/dl_fft2r_fc32_arp4.S", "size": 5487, "hash": "25d706406cd3e7de3407ad8518790fb157097bad1a7e705a9d67736c4115f76b"}, {"path": "base/isa/esp32p4/dl_fft4r_fc32_arp4.S", "size": 11937, "hash": "16c9a05d68cda7e3b189d7c7290013bc7614aa879cc70b1e10e7c3ec4803b1ed"}, {"path": "base/isa/esp32s3/dl_fft2r_fc32_aes3.S", "size": 5901, "hash": "53e570f9b1d888cf4b36a07ea099ecd80905beb2e957c5f9d390e2a14b5d49b4"}, {"path": "base/isa/esp32s3/dl_fft4r_fc32_aes3.S", "size": 10950, "hash": "51840adc29034c0e0983439b366bfe2f10e697c1c9667e27cdcb1394905d3930"}]}

View File

@@ -0,0 +1,36 @@
# Component build script for the dl_fft library (ESP-IDF component).
# Reads the IDF_TARGET build property into `target`; the variable is not referenced again in this file.
idf_build_get_property(target IDF_TARGET)
# Sources that are compiled unconditionally, whatever the selected chip.
set(srcs "dl_fft_f32.c"
"dl_fft_s16.c"
"dl_rfft_f32.c"
"dl_rfft_s16.c"
"dl_fft.hpp"
"base/dl_fft2r_fc32_ansi.c"
"base/dl_fft4r_fc32_ansi.c"
"base/dl_fft2r_sc16_ansi.c"
"base/dl_fft_base.c"
)
# Public include directories exported by the component.
set(include_dirs "."
"base"
"base/isa"
)
# Append the chip-specific assembly kernels; other targets only get the sources listed above.
if(CONFIG_IDF_TARGET_ESP32)
list(APPEND srcs "base/isa/esp32/dl_fft2r_fc32_ae32.S"
"base/isa/esp32/dl_fft4r_fc32_ae32.S" )
elseif(CONFIG_IDF_TARGET_ESP32S3)
list(APPEND srcs "base/isa/esp32s3/dl_fft2r_fc32_aes3.S"
"base/isa/esp32s3/dl_fft4r_fc32_aes3.S" )
elseif(CONFIG_IDF_TARGET_ESP32P4)
list(APPEND srcs "base/isa/esp32p4/dl_fft2r_fc32_arp4.S"
"base/isa/esp32p4/dl_fft4r_fc32_arp4.S" )
endif()
idf_component_register(SRCS ${srcs} INCLUDE_DIRS ${include_dirs})
# Extra compiler flags for this component. NOTE(review): -ffast-math relaxes IEEE float semantics — confirm this is intended for all sources.
component_compile_options(-ffast-math -O2)

View File

@@ -0,0 +1,89 @@
# DL_FFT
DL_FFT is a lightweight FFT library supporting both float32 and int16 data types.
The float FFT implementation comes from esp-dsp, and we further optimized the int16 FFT to achieve better precision.
For int16 FFT, we recommend using the `dl_fft_s16_hp_run` or `dl_rfft_s16_hp_run` interface. `hp` means "high precision".
## Get Started
### C interface
```
#include "dl_fft.h"
#include "dl_rfft.h"
// float fft
float x[nfft*2];
float *x = (float *)heap_caps_aligned_alloc(16, nfft * sizeof(float) *2, MALLOC_CAP_8BIT);
dl_fft_f32_t *fft_handle = dl_fft_f32_init(nfft, MALLOC_CAP_8BIT);
dl_fft_f32_run(fft_handle, x);
dl_ifft_f32_run(fft_handle, x);
dl_fft_f32_deinit(fft_handle);
// float rfft
float *x = (float *)heap_caps_aligned_alloc(16, nfft * sizeof(float), MALLOC_CAP_8BIT);
dl_fft_f32_t *fft_handle = dl_rfft_f32_init(nfft, MALLOC_CAP_8BIT);
dl_rfft_f32_run(fft_handle, x);
dl_irfft_f32_run(fft_handle, x);
dl_rfft_f32_deinit(fft_handle);
// int16 fft
int16_t *x = (int16_t *)heap_caps_aligned_alloc(16, nfft * sizeof(int16_t) * 2, MALLOC_CAP_8BIT);
float *y = (float *)heap_caps_aligned_alloc(16, nfft * sizeof(float) *2, MALLOC_CAP_8BIT);
int in_exponent = -15; // float y = x * 2^in_exponent;
int fft_exponent;
int ifft_exponent;
dl_fft_s16_t *fft_handle = dl_fft_s16_init(nfft, MALLOC_CAP_8BIT);
dl_fft_s16_hp_run(fft_handle, x, in_exponent, &fft_exponent);
dl_ifft_s16_hp_run(fft_handle, x, fft_exponent, &ifft_exponent);
dl_short_to_float(x, nfft, ifft_exponent, y); // convert output from int16_t to float
dl_fft_s16_deinit(fft_handle);
// int16 rfft
int16_t *x = (int16_t *)heap_caps_aligned_alloc(16, nfft * sizeof(int16_t), MALLOC_CAP_8BIT);
float *y = (float *)heap_caps_aligned_alloc(16, nfft * sizeof(float), MALLOC_CAP_8BIT);
int in_exponent = -15; // float y = x * 2^in_exponent;
int fft_exponent;
int ifft_exponent;
dl_fft_s16_t *fft_handle = dl_rfft_s16_init(nfft, MALLOC_CAP_8BIT);
dl_rfft_s16_hp_run(fft_handle, x, in_exponent, &fft_exponent);
dl_irfft_s16_hp_run(fft_handle, x, fft_exponent, &ifft_exponent);
dl_short_to_float(x, nfft, ifft_exponent, y); // convert output from int16_t to float
dl_rfft_s16_deinit(fft_handle);
```
Please refer to [dl_fft.h](./dl_fft.h) and [dl_rfft.h](./dl_rfft.h) for more details.
> Note: The input array x must be allocated with heap_caps_aligned_alloc and aligned to 16 bytes.
## FAQ:
#### 1. Why not just use esp-dsp directly?
Because esp-dsp uses global variables to share FFT tables and other parameters in order to minimize memory consumption. This introduces significant risks for independent components. Your FFT results might be corrupted by other programs, and this is something you have little control over.
#### 2. What does dl_fft do?
1. Provides a unified and simple FFT/IFFT interface. Users no longer need to worry about their FFT results being affected by other programs. All FFT tables are allocated and released within the function scope.
2. Reimplements an int16 FFT/IFFT. Dynamic quantization is used during butterfly operations to achieve better precision.
3. [TODO] Uses built-in FFT instructions on ESP32-S3 and ESP32-P4 to further accelerate int16 FFT/IFFT.
## Benchmark
test code: [test_apps/dl_fft](https://github.com/espressif/esp-dl/tree/master/test_apps/dl_fft)
- [ESP32-S3 fft benchmark](./benchmark_esp32s3.md)
- [ESP32-P4 fft benchmark](./benchmark_esp32p4.md)
- [ESP32-C5 fft benchmark](./benchmark_esp32c5.md)
## Reference
- [esp-dsp](https://github.com/espressif/esp-dsp)
- [kissfft](https://github.com/mborgerding/kissfft)
- [fftw](https://github.com/FFTW/fftw3)

View File

@@ -0,0 +1,258 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dl_fft_base.h"
// unsigned short reverse(unsigned short x, unsigned short N, int order);
/**
 * Run the radix-2 butterfly passes of a complex float FFT in place.
 *
 * @param data  Interleaved (re, im) float pairs; element n lives at data[2n], data[2n + 1].
 * @param N     Number of complex points. The span N / 2 is halved each pass until it reaches 0.
 * @param w     Twiddle table read as (w[2g], w[2g + 1]) pairs, one pair per butterfly group g.
 * @return      Always ESP_OK.
 *
 * No reordering of the data is done here; only the butterflies are applied.
 */
esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w)
{
    int groups = 1;

    for (int half = N / 2; half > 0; half >>= 1) {
        int lo = 0;
        for (int g = 0; g < groups; g++) {
            const float wr = w[2 * g];
            const float wi = w[2 * g + 1];
            for (int n = 0; n < half; n++, lo++) {
                float *top = &data[2 * lo];
                float *bot = &data[2 * (lo + half)];
                // Rotate the lower element by the twiddle before combining.
                const float tr = wr * bot[0] + wi * bot[1];
                const float ti = wr * bot[1] - wi * bot[0];
                bot[0] = top[0] - tr;
                bot[1] = top[1] - ti;
                top[0] = top[0] + tr;
                top[1] = top[1] + ti;
            }
            // Skip the lower half that was just written.
            lo += half;
        }
        groups <<= 1;
    }
    return ESP_OK;
}
/**
 * Run the radix-2 butterfly passes of a complex float inverse FFT in place.
 *
 * Identical to the forward butterflies except that the second value of each
 * twiddle pair (w[2g + 1]) is negated before use. No scaling by 1/N is applied here.
 *
 * @param data  Interleaved (re, im) float pairs, updated in place.
 * @param N     Number of complex points.
 * @param w     Twiddle table read as (w[2g], w[2g + 1]) pairs, one pair per butterfly group g.
 * @return      Always ESP_OK.
 */
esp_err_t dl_ifft2r_fc32_ansi(float *data, int N, float *w)
{
    int groups = 1;

    for (int half = N / 2; half > 0; half >>= 1) {
        int lo = 0;
        for (int g = 0; g < groups; g++) {
            const float wr = w[2 * g];
            const float wi = -w[2 * g + 1];
            for (int n = 0; n < half; n++, lo++) {
                float *top = &data[2 * lo];
                float *bot = &data[2 * (lo + half)];
                const float tr = wr * bot[0] + wi * bot[1];
                const float ti = wr * bot[1] - wi * bot[0];
                bot[0] = top[0] - tr;
                bot[1] = top[1] - ti;
                top[0] = top[0] + tr;
                top[1] = top[1] + ti;
            }
            lo += half;
        }
        groups <<= 1;
    }
    return ESP_OK;
}
/**
 * Reorder interleaved complex float data by swapping pairs of elements in place.
 *
 * @param data          Interleaved (re, im) float pairs.
 * @param N             Number of complex points; only used when bitrev_table is NULL.
 * @param bitrev_table  Optional list of swap pairs. Each entry holds two float offsets into
 *                      data (offset of the real part; the imaginary part is at offset + 1).
 * @param bitrev_size   Number of swap pairs in bitrev_table.
 * @return              Always ESP_OK.
 *
 * Without a table, the swap partners are generated on the fly with a running
 * bit-reversed counter over indices 1 .. N - 2.
 */
esp_err_t dl_bitrev2r_fc32_ansi(float *data, int N, uint16_t *bitrev_table, int bitrev_size)
{
    if (bitrev_table) {
        const uint16_t *pair = bitrev_table;
        for (int n = 0; n < bitrev_size; n++, pair += 2) {
            float *a = data + pair[0];
            float *b = data + pair[1];
            const float re = b[0];
            b[0] = a[0];
            a[0] = re;
            const float im = b[1];
            b[1] = a[1];
            a[1] = im;
        }
        return ESP_OK;
    }

    int rev = 0;
    for (int idx = 1; idx < (N - 1); idx++) {
        // Advance the bit-reversed counter: clear leading set bits, then set the next one.
        int bit = N >> 1;
        for (; bit <= rev; bit >>= 1) {
            rev -= bit;
        }
        rev += bit;
        if (idx >= rev) {
            continue; // swap each pair only once
        }
        float *a = data + idx * 2;
        float *b = data + rev * 2;
        const float re = b[0];
        b[0] = a[0];
        a[0] = re;
        const float im = b[1];
        b[1] = a[1];
        a[1] = im;
    }
    return ESP_OK;
}
esp_err_t dl_rfft_post_proc_fc32_ansi(float *data, int N, float *table)
{
dl_fc32_t *result = (dl_fc32_t *)data;
// Original formula...
// result[0].re = result[0].re + result[0].im;
// result[N].re = result[0].re - result[0].im;
// result[0].im = 0;
// result[N].im = 0;
// Optimized one:
float tmp_re = result[0].re;
result[0].re = tmp_re + result[0].im;
result[0].im = tmp_re - result[0].im;
dl_fc32_t f1k, f2k;
for (int k = 1; k <= N / 2; k++) {
dl_fc32_t fpk = result[k];
dl_fc32_t fpnk = result[N - k];
f1k.re = fpk.re + fpnk.re;
f1k.im = fpk.im - fpnk.im;
f2k.re = fpk.re - fpnk.re;
f2k.im = fpk.im + fpnk.im;
float c = -table[k * 2 - 1];
float s = -table[k * 2 - 2];
dl_fc32_t tw;
tw.re = c * f2k.re - s * f2k.im;
tw.im = s * f2k.re + c * f2k.im;
result[k].re = 0.5 * (f1k.re + tw.re);
result[k].im = 0.5 * (f1k.im + tw.im);
result[N - k].re = 0.5 * (f1k.re - tw.re);
result[N - k].im = 0.5 * (tw.im - f1k.im);
}
return ESP_OK;
}
esp_err_t dl_rfft_pre_proc_fc32_ansi(float *data, int N, float *table)
{
dl_fc32_t *result = (dl_fc32_t *)data;
float tmp_re = result[0].re;
result[0].re = (tmp_re + result[0].im) * 0.5;
result[0].im = (tmp_re - result[0].im) * 0.5;
dl_fc32_t f1k, f2k;
for (int k = 1; k <= N / 2; k++) {
dl_fc32_t fpk = result[k];
dl_fc32_t fpnk = result[N - k];
f1k.re = fpk.re + fpnk.re;
f1k.im = fpk.im - fpnk.im;
f2k.re = fpk.re - fpnk.re;
f2k.im = fpk.im + fpnk.im;
float c = -table[k * 2 - 1];
float s = table[k * 2 - 2];
dl_fc32_t tw;
tw.re = c * f2k.re - s * f2k.im;
tw.im = s * f2k.re + c * f2k.im;
result[k].re = 0.5 * (f1k.re + tw.re);
result[k].im = 0.5 * (f1k.im + tw.im);
result[N - k].re = 0.5 * (f1k.re - tw.re);
result[N - k].im = 0.5 * (tw.im - f1k.im);
}
return ESP_OK;
}
/**
 * Allocate and fill the float coefficient table used by the real-FFT pre/post processing.
 *
 * @param fft_point  Table length in floats; fft_point / 2 (cos, sin) pairs are written.
 * @param caps       Capability flags forwarded to heap_caps_aligned_alloc.
 * @return           16-byte aligned table, or NULL when the allocation fails.
 *
 * Pair i (1-based) holds cos and sin of 2 * pi * i / fft_point at table[2i - 2], table[2i - 1].
 * The buffer is obtained from heap_caps_aligned_alloc; this function does not free it.
 */
float *dl_gen_rfft_table_f32(int fft_point, uint32_t caps)
{
    float *table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float), caps);
    if (!table) {
        return table;
    }

    const int pairs = fft_point >> 1;
    float *out = table;
    for (int i = 1; i <= pairs; i++) {
        // The angle is evaluated in double precision, then narrowed to float.
        float angle = 2 * M_PI * i * 1.0 / fft_point;
        *out++ = cosf(angle);
        *out++ = sinf(angle);
    }
    return table;
}
/**
 * Walk the radix-2 bit-reversal permutation of N points and collect the swap pairs.
 *
 * @param N    Number of complex points.
 * @param out  When non-NULL, receives two uint16_t float offsets (2 * index) per pair.
 * @return     Number of (i, j) pairs with i < j.
 */
static int dl_bitrev2r_collect_pairs(int N, uint16_t *out)
{
    int count = 0;
    int j = 0;
    for (int i = 1; i < (N - 1); i++) {
        int k = N >> 1;
        while (k <= j) {
            j -= k;
            k >>= 1;
        }
        j += k;
        if (i < j) {
            if (out) {
                out[count * 2] = (uint16_t)(j * 2);
                out[count * 2 + 1] = (uint16_t)(i * 2);
            }
            count++;
        }
    }
    return count;
}

/**
 * Build the swap table consumed by dl_bitrev2r_fc32_ansi.
 *
 * @param N            Number of complex points.
 * @param caps         Capability flags forwarded to heap_caps_malloc.
 * @param bitrev_size  Out: number of swap pairs in the returned table; set to 0 whenever
 *                     NULL is returned.
 * @return             Table of 2 * bitrev_size uint16_t float offsets, or NULL when the
 *                     offsets do not fit in uint16_t or the allocation fails.
 *
 * Each entry stores 2 * index, so the largest stored value is 2 * (N - 1). The table is
 * refused when that value exceeds UINT16_MAX; the previous check on the number of pairs let
 * N = 65536 through and silently truncated the offsets. The new bound also covers the old
 * one, since there are fewer than N pairs.
 */
uint16_t *dl_gen_bitrev2r_table(int N, uint32_t caps, int *bitrev_size)
{
    bitrev_size[0] = 0;

    if (N - 1 > UINT16_MAX / 2) {
        return NULL;
    }

    const int count = dl_bitrev2r_collect_pairs(N, NULL);
    uint16_t *bitrev_table = (uint16_t *)heap_caps_malloc(2 * count * sizeof(uint16_t), caps);
    if (bitrev_table) {
        dl_bitrev2r_collect_pairs(N, bitrev_table);
        bitrev_size[0] = count;
    }
    return bitrev_table;
}
/**
 * Allocate and fill the twiddle table for the radix-2 float FFT.
 *
 * @param fft_point  Table length in floats; fft_point / 2 (cos, sin) pairs are generated.
 * @param caps       Capability flags forwarded to heap_caps_aligned_alloc.
 * @return           16-byte aligned table, or NULL when the allocation fails.
 *
 * Pair i holds cos and sin of i * 2 * pi / fft_point. After generation the pairs are
 * reordered with dl_bitrev2r_fc32_ansi (table-less path).
 */
float *dl_gen_fftr2_table_f32(int fft_point, uint32_t caps)
{
    float *table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float), caps);
    if (!table) {
        return table;
    }

    const float step = M_PI * 2.0 / fft_point;
    const int pairs = fft_point >> 1;
    for (int i = 0; i < pairs; i++) {
        const float phase = i * step;
        table[2 * i] = cosf(phase);
        table[2 * i + 1] = sinf(phase);
    }
    dl_bitrev2r_fc32_ansi(table, pairs, NULL, 0);
    return table;
}

View File

@@ -0,0 +1,580 @@
#include "dl_fft_base.h"
/**
 * Fixed-point butterfly term: (a0 * 2^15 - (a1 * a2 + a3 * a4) + add_rount_mult) >> result_shift.
 *
 * @param a0              Value scaled up by 2^15 before the products are subtracted.
 * @param a1,a2,a3,a4     Factors of the two 16x16 products.
 * @param result_shift    Right shift applied to the accumulated value.
 * @param add_rount_mult  Rounding constant added before the shift.
 * @return                The shifted accumulator truncated to int16_t.
 *
 * a0 is scaled with a multiplication rather than `<< 15`, because left-shifting a negative
 * value is undefined behaviour in C. The right shift of a negative accumulator relies on
 * the compiler's arithmetic shift, as before.
 */
static inline int16_t dl_xtfixed_bf_1(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int result = (int)a0 * (1 << 15);
    result -= (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;
    result += add_rount_mult;
    result = result >> result_shift;
    return (int16_t)result;
}
/**
 * Fixed-point butterfly term: (a0 * 2^15 - (a1 * a2 - a3 * a4) + add_rount_mult) >> result_shift.
 *
 * @param a0              Value scaled up by 2^15 before the product difference is subtracted.
 * @param a1,a2,a3,a4     Factors of the two 16x16 products.
 * @param result_shift    Right shift applied to the accumulated value.
 * @param add_rount_mult  Rounding constant added before the shift.
 * @return                The shifted accumulator truncated to int16_t.
 *
 * a0 is scaled with a multiplication rather than `<< 15`, because left-shifting a negative
 * value is undefined behaviour in C.
 */
static inline int16_t dl_xtfixed_bf_2(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int result = (int)a0 * (1 << 15);
    result -= ((int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4);
    result += add_rount_mult;
    result = result >> result_shift;
    return (int16_t)result;
}
/**
 * Fixed-point butterfly term: (a0 * 2^15 + (a1 * a2 + a3 * a4) + add_rount_mult) >> result_shift.
 *
 * @param a0              Value scaled up by 2^15 before the products are added.
 * @param a1,a2,a3,a4     Factors of the two 16x16 products.
 * @param result_shift    Right shift applied to the accumulated value.
 * @param add_rount_mult  Rounding constant added before the shift.
 * @return                The shifted accumulator truncated to int16_t.
 *
 * a0 is scaled with a multiplication rather than `<< 15`, because left-shifting a negative
 * value is undefined behaviour in C.
 */
static inline int16_t dl_xtfixed_bf_3(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int result = (int)a0 * (1 << 15);
    result += (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;
    result += add_rount_mult;
    result = result >> result_shift;
    return (int16_t)result;
}
/**
 * Fixed-point butterfly term: (a0 * 2^15 + (a1 * a2 - a3 * a4) + add_rount_mult) >> result_shift.
 *
 * @param a0              Value scaled up by 2^15 before the product difference is added.
 * @param a1,a2,a3,a4     Factors of the two 16x16 products.
 * @param result_shift    Right shift applied to the accumulated value.
 * @param add_rount_mult  Rounding constant added before the shift.
 * @return                The shifted accumulator truncated to int16_t.
 *
 * a0 is scaled with a multiplication rather than `<< 15`, because left-shifting a negative
 * value is undefined behaviour in C.
 */
static inline int16_t dl_xtfixed_bf_4(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int result = (int)a0 * (1 << 15);
    result += (int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4;
    result += add_rount_mult;
    result = result >> result_shift;
    return (int16_t)result;
}
/**
 * Run the radix-2 butterfly passes of a complex int16 FFT in place.
 *
 * @param data   Interleaved (re, im) int16 pairs; each pair is loaded and stored as one
 *               32-bit word through the `data` member of dl_sc16_t.
 * @param N      Number of complex points.
 * @param table  Twiddle table, one packed (re, im) word per butterfly group.
 * @return       Always ESP_OK.
 *
 * Every butterfly output is shifted right by 16 with a rounding constant of 1 << 15
 * (see the dl_xtfixed_bf_* helpers), so each pass halves the signal.
 */
esp_err_t dl_fft2r_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    uint32_t *twiddles = (uint32_t *)table;
    uint32_t *points = (uint32_t *)data;
    const int out_shift = 16;
    const int rounding = 1 << 15;
    int groups = 1;

    for (int half = N / 2; half > 0; half >>= 1) {
        int lo = 0;
        for (int g = 0; g < groups; g++) {
            dl_sc16_t tw; // re = c, im = s
            tw.data = twiddles[g];
            for (int n = 0; n < half; n++, lo++) {
                const int hi = lo + half;
                dl_sc16_t lower, upper, diff, sum;
                lower.data = points[hi];
                upper.data = points[lo];

                // points[hi] = upper - tw * lower
                diff.re = dl_xtfixed_bf_1(upper.re, tw.re, lower.re, tw.im, lower.im, out_shift, rounding);
                diff.im = dl_xtfixed_bf_2(upper.im, tw.re, lower.im, tw.im, lower.re, out_shift, rounding);
                points[hi] = diff.data;

                // points[lo] = upper + tw * lower
                sum.re = dl_xtfixed_bf_3(upper.re, tw.re, lower.re, tw.im, lower.im, out_shift, rounding);
                sum.im = dl_xtfixed_bf_4(upper.im, tw.re, lower.im, tw.im, lower.re, out_shift, rounding);
                points[lo] = sum.data;
            }
            lo += half;
        }
        groups <<= 1;
    }
    return ESP_OK;
}
esp_err_t dl_ifft2r_sc16_ansi(int16_t *data, int N, int16_t *table)
{
esp_err_t result = ESP_OK;
uint32_t *w = (uint32_t *)table;
uint32_t *in_data = (uint32_t *)data;
int ie, ia, m;
dl_sc16_t cs; // c - re, s - im
dl_sc16_t m_data;
dl_sc16_t a_data;
int add_rount_mult = 1 << 15;
ie = 1;
for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
ia = 0;
for (int j = 0; j < ie; j++) {
cs.data = w[j];
cs.im = -cs.im;
// c = w[2 * j];
// s = w[2 * j + 1];
for (int i = 0; i < N2; i++) {
m = ia + N2;
m_data.data = in_data[m];
a_data.data = in_data[ia];
// data[2 * m] = data[2 * ia] - re_temp;
// data[2 * m + 1] = data[2 * ia + 1] - im_temp;
dl_sc16_t m1;
m1.re = dl_xtfixed_bf_1(a_data.re,
cs.re,
m_data.re,
cs.im,
m_data.im,
16,
add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
m1.im = dl_xtfixed_bf_2(a_data.im,
cs.re,
m_data.im,
cs.im,
m_data.re,
16,
add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
in_data[m] = m1.data;
// data[2 * ia] = data[2 * ia] + re_temp;
// data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
dl_sc16_t m2;
m2.re = dl_xtfixed_bf_3(a_data.re,
cs.re,
m_data.re,
cs.im,
m_data.im,
16,
add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
m2.im = dl_xtfixed_bf_4(a_data.im,
cs.re,
m_data.im,
cs.im,
m_data.re,
16,
add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
in_data[ia] = m2.data;
ia++;
}
ia += N2;
}
ie <<= 1;
}
return result;
}
/**
 * "High precision" radix-2 int16 FFT butterflies with a per-pass output shift.
 *
 * @param data   Interleaved (re, im) int16 pairs, accessed as 32-bit words; updated in place.
 * @param N      Number of complex points.
 * @param table  Twiddle table, one packed (re, im) word per butterfly group.
 * @param shift  Out: shift[0] is reset to 0 and then accumulates (pass shift - 15) per pass.
 * @return       Always ESP_OK.
 *
 * On the first pass and every third pass after it, the pass shift is taken from
 * dl_array_max_q_s16(data, N * 2) and bumped by one when below 16; the other passes use 16.
 * NOTE(review): dl_array_max_q_s16 is defined elsewhere; its exact return range is assumed
 * to keep (shift - 1) non-negative — confirm.
 */
esp_err_t dl_fft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
    uint32_t *twiddles = (uint32_t *)table;
    uint32_t *points = (uint32_t *)data;
    int passes_since_scan = 2; // forces a scan on the very first pass
    int groups = 1;

    shift[0] = 0;
    for (int half = N / 2; half > 0; half >>= 1) {
        int pass_shift = 16;
        if (passes_since_scan == 2) {
            pass_shift = dl_array_max_q_s16(data, N * 2);
            if (pass_shift < 16) {
                pass_shift += 1;
            }
            passes_since_scan = 0;
        } else {
            passes_since_scan += 1;
        }
        shift[0] += pass_shift - 15;
        const int rounding = 1 << (pass_shift - 1);

        int lo = 0;
        for (int g = 0; g < groups; g++) {
            dl_sc16_t tw; // re = c, im = s
            tw.data = twiddles[g];
            for (int n = 0; n < half; n++, lo++) {
                const int hi = lo + half;
                dl_sc16_t lower, upper, diff, sum;
                lower.data = points[hi];
                upper.data = points[lo];

                // points[hi] = upper - tw * lower
                diff.re = dl_xtfixed_bf_1(upper.re, tw.re, lower.re, tw.im, lower.im, pass_shift, rounding);
                diff.im = dl_xtfixed_bf_2(upper.im, tw.re, lower.im, tw.im, lower.re, pass_shift, rounding);
                points[hi] = diff.data;

                // points[lo] = upper + tw * lower
                sum.re = dl_xtfixed_bf_3(upper.re, tw.re, lower.re, tw.im, lower.im, pass_shift, rounding);
                sum.im = dl_xtfixed_bf_4(upper.im, tw.re, lower.im, tw.im, lower.re, pass_shift, rounding);
                points[lo] = sum.data;
            }
            lo += half;
        }
        groups <<= 1;
    }
    return ESP_OK;
}
esp_err_t dl_ifft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
esp_err_t result = ESP_OK;
uint32_t *w = (uint32_t *)table;
uint32_t *in_data = (uint32_t *)data;
int ie, ia, m, loop_num = 2;
dl_sc16_t cs; // c - re, s - im
dl_sc16_t m_data;
dl_sc16_t a_data;
int add_rount_mult = 1 << 15;
ie = 1;
shift[0] = 0;
for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
ia = 0;
int loop_shift = 16;
if (loop_num == 2) {
loop_shift = dl_array_max_q_s16(data, N * 2);
if (loop_shift < 16) {
loop_shift += 1;
}
loop_num = 0;
} else {
loop_num += 1;
}
shift[0] += loop_shift - 15;
add_rount_mult = 1 << (loop_shift - 1);
for (int j = 0; j < ie; j++) {
cs.data = w[j];
cs.im = -cs.im;
// c = w[2 * j];
// s = w[2 * j + 1];
for (int i = 0; i < N2; i++) {
m = ia + N2;
m_data.data = in_data[m];
a_data.data = in_data[ia];
// data[2 * m] = data[2 * ia] - re_temp;
// data[2 * m + 1] = data[2 * ia + 1] - im_temp;
dl_sc16_t m1;
m1.re = dl_xtfixed_bf_1(a_data.re,
cs.re,
m_data.re,
cs.im,
m_data.im,
loop_shift,
add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
m1.im = dl_xtfixed_bf_2(a_data.im,
cs.re,
m_data.im,
cs.im,
m_data.re,
loop_shift,
add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
in_data[m] = m1.data;
// data[2 * ia] = data[2 * ia] + re_temp;
// data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
dl_sc16_t m2;
m2.re = dl_xtfixed_bf_3(a_data.re,
cs.re,
m_data.re,
cs.im,
m_data.im,
loop_shift,
add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
m2.im = dl_xtfixed_bf_4(a_data.im,
cs.re,
m_data.im,
cs.im,
m_data.re,
loop_shift,
add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
in_data[ia] = m2.data;
ia++;
}
ia += N2;
}
ie <<= 1;
}
return result;
}
/**
 * Reverse the low `order` bits of x.
 *
 * All 16 bits of x are mirrored first, then the result is shifted right by (16 - order) so
 * that the reversed field ends up in the low bits.
 *
 * @param x      Value to reverse.
 * @param N      Unused.
 * @param order  Number of significant bits.
 * @return       The bit-reversed value.
 */
static inline unsigned short reverse_sc16(unsigned short x, unsigned short N, int order)
{
    (void)N;
    unsigned short mirrored = 0;
    for (int bit = 0; bit < 16; bit++) {
        mirrored = (unsigned short)((mirrored << 1) | ((x >> bit) & 1u));
    }
    return mirrored >> (16 - order);
}
/**
 * Reorder N complex int16 points into bit-reversed order in place.
 *
 * @param data  Interleaved (re, im) int16 pairs; each pair is swapped as one 32-bit word.
 * @param N     Number of complex points.
 * @return      Always ESP_OK.
 */
esp_err_t dl_bitrev2r_sc16_ansi(int16_t *data, int N)
{
    uint32_t *points = (uint32_t *)data;
    int rev = 0;

    for (int idx = 1; idx < (N - 1); idx++) {
        // Advance the bit-reversed counter: clear leading set bits, then set the next one.
        int bit = N >> 1;
        for (; bit <= rev; bit >>= 1) {
            rev -= bit;
        }
        rev += bit;
        if (idx < rev) { // swap each pair only once
            const uint32_t held = points[rev];
            points[rev] = points[idx];
            points[idx] = held;
        }
    }
    return ESP_OK;
}
// Recombine the elements of an N-point complex int16 buffer in place.
// NOTE(review): presumably this splits the FFT of two real sequences packed as one complex
// input into their separate spectra (as the similarly named esp-dsp routine does) — confirm.
//
// data: interleaved (re, im) int16 pairs. It is read through a 32-bit view (one complex point
//       per word) and written through the int16 view; when the loop runs, the highest int16
//       index written is 2 * N - 1.
// N:    number of complex points; the loop handles N / 4 iterations.
// Returns ESP_OK unconditionally.
esp_err_t dl_cplx2reC_sc16(int16_t *data, int N)
{
esp_err_t result = ESP_OK;
int i;
int n2 = N << (1); // we will operate with int32 indexes
uint32_t *in_data = (uint32_t *)data;
dl_sc16_t kl;
dl_sc16_t kh;
dl_sc16_t nl;
dl_sc16_t nh;
for (i = 0; i < (N / 4); i++) {
// Load all four source points before any store: the stores below hit the same buffer.
kl.data = in_data[i + 1];
nl.data = in_data[N - i - 1];
kh.data = in_data[i + 1 + N / 2];
nh.data = in_data[N - i - 1 - N / 2];
// Sums and differences are computed in int and truncated to int16 on store.
// The store order is significant if two target indices coincide; do not reorder.
data[i * 2 + 0 + 2] = kl.re + nl.re;
data[i * 2 + 1 + 2] = kl.im - nl.im;
data[n2 - i * 2 - 1 - N] = kh.re + nh.re;
data[n2 - i * 2 - 2 - N] = kh.im - nh.im;
data[i * 2 + 0 + 2 + N] = kl.im + nl.im;
data[i * 2 + 1 + 2 + N] = kl.re - nl.re;
data[n2 - i * 2 - 1] = kh.im + nh.im;
data[n2 - i * 2 - 2] = kh.re - nh.re;
}
// Move the imaginary part of point 0 to the start of the second half, then clear both imaginary slots.
data[N] = data[1];
data[1] = 0;
data[N + 1] = 0;
return result;
}
/**
 * Post-process an N-point complex int16 FFT result in place (real-FFT unpacking step).
 *
 * @param data   Interleaved (re, im) int16 pairs viewed as dl_sc16_t[]; elements 0 .. N - 1
 *               are read and rewritten.
 * @param N      Number of complex points in data.
 * @param table  Coefficient table; entries table[2k - 2] and table[2k - 1] are read for
 *               k = 1 .. N / 2.
 * @return       Always ESP_OK.
 *
 * Element 0 becomes ((re + im + 1) >> 1, (re - im + 1) >> 1). Every other output is
 * (f1k * 2^15 +/- tw + 2^16) >> 17, i.e. rounded and scaled down by one extra bit.
 *
 * The 2^15 scaling is done with a multiplication rather than `<< 15`: the sums can be
 * negative, and left-shifting a negative value is undefined behaviour in C.
 */
esp_err_t dl_rfft_post_proc_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    dl_sc16_t *result = (dl_sc16_t *)data;

    int32_t tmp_re = result[0].re + 1;
    result[0].re = (tmp_re + result[0].im) >> 1;
    result[0].im = (tmp_re - result[0].im) >> 1;

    const int round = 1 << 16;
    const int32_t q15_one = 1 << 15;
    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    for (int k = 1; k <= N / 2; k++) {
        // Copy both inputs first: result[k] and result[N - k] are overwritten below.
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        int16_t c = -table[k * 2 - 1];
        int16_t s = -table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        // Bring f1k to the same Q15 scale as the twiddle products.
        f1k_re = f1k_re * q15_one;
        f1k_im = f1k_im * q15_one;
        result[k].re = (f1k_re + tw_re + round) >> 17;
        result[k].im = (f1k_im + tw_im + round) >> 17;
        result[N - k].re = (f1k_re - tw_re + round) >> 17;
        result[N - k].im = (tw_im - f1k_im + round) >> 17;
    }
    return ESP_OK;
}
/**
 * Pre-process packed real-FFT int16 data in place before the inverse complex FFT.
 *
 * @param data   Interleaved (re, im) int16 pairs viewed as dl_sc16_t[]; elements 0 .. N - 1
 *               are read and rewritten.
 * @param N      Number of complex points in data.
 * @param table  Coefficient table; entries table[2k - 2] and table[2k - 1] are read for
 *               k = 1 .. N / 2.
 * @return       Always ESP_OK.
 *
 * Element 0 becomes ((re + im + 2) >> 2, (re - im + 2) >> 2). Every other output is
 * (f1k * 2^15 +/- tw + 2^16) >> 17. Compared with the post-processing step, only the sign
 * of `s` differs.
 *
 * The 2^15 scaling is done with a multiplication rather than `<< 15`: the sums can be
 * negative, and left-shifting a negative value is undefined behaviour in C.
 */
esp_err_t dl_rfft_pre_proc_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    dl_sc16_t *result = (dl_sc16_t *)data;

    int32_t tmp_re = result[0].re + 2;
    result[0].re = (tmp_re + result[0].im) >> 2;
    result[0].im = (tmp_re - result[0].im) >> 2;

    const int round = 1 << 16;
    const int32_t q15_one = 1 << 15;
    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    for (int k = 1; k <= N / 2; k++) {
        // Copy both inputs first: result[k] and result[N - k] are overwritten below.
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        int16_t c = -table[k * 2 - 1];
        int16_t s = table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        // Bring f1k to the same Q15 scale as the twiddle products.
        f1k_re = f1k_re * q15_one;
        f1k_im = f1k_im * q15_one;
        result[k].re = (f1k_re + tw_re + round) >> 17;
        result[k].im = (f1k_im + tw_im + round) >> 17;
        result[N - k].re = (f1k_re - tw_re + round) >> 17;
        result[N - k].im = (tw_im - f1k_im + round) >> 17;
    }
    return ESP_OK;
}
/**
 * "High precision" real-FFT unpacking of an N-point complex int16 FFT result, in place.
 *
 * @param data   Interleaved (re, im) int16 pairs viewed as dl_sc16_t[]; elements 0 .. N - 1
 *               are read and rewritten.
 * @param N      Number of complex points in data.
 * @param table  Coefficient table; entries table[2k - 2] and table[2k - 1] are read for
 *               k = 1 .. N / 2.
 * @param shift  In/out: shift[0] is incremented by (loop_shift - 15); the caller's previous
 *               value is kept and added to.
 * @return       Always ESP_OK.
 *
 * loop_shift comes from dl_array_max_q_s16(data, N), which is defined elsewhere.
 * NOTE(review): its return range is assumed to be non-negative — confirm.
 * Element 0 is scaled by 2^(15 - loop_shift); the other outputs are
 * (f1k * 2^15 +/- tw + 2^loop_shift) >> (loop_shift + 1).
 *
 * All up-scaling is done with multiplications rather than `<<`: the operands can be
 * negative, and left-shifting a negative value is undefined behaviour in C.
 */
esp_err_t dl_cplx2real_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
    dl_sc16_t *result = (dl_sc16_t *)data;

    int loop_shift = dl_array_max_q_s16(data, N);
    const int round = 1 << loop_shift;
    const int32_t q15_one = 1 << 15;
    int32_t tmp_re = result[0].re;
    shift[0] += loop_shift - 15;

    if (loop_shift >= 15) {
        result[0].re = (tmp_re + result[0].im) >> (loop_shift - 15);
        result[0].im = (tmp_re - result[0].im) >> (loop_shift - 15);
    } else {
        const int32_t gain = 1 << (15 - loop_shift);
        result[0].re = (tmp_re + result[0].im) * gain;
        result[0].im = (tmp_re - result[0].im) * gain;
    }

    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    // One extra bit of shift: each output is half of (f1k +/- tw).
    loop_shift += 1;
    for (int k = 1; k <= N / 2; k++) {
        // Copy both inputs first: result[k] and result[N - k] are overwritten below.
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        int16_t c = -table[k * 2 - 1];
        int16_t s = -table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        f1k_re = f1k_re * q15_one;
        f1k_im = f1k_im * q15_one;
        result[k].re = (f1k_re + tw_re + round) >> loop_shift;
        result[k].im = (f1k_im + tw_im + round) >> loop_shift;
        result[N - k].re = (f1k_re - tw_re + round) >> loop_shift;
        result[N - k].im = (tw_im - f1k_im + round) >> loop_shift;
    }
    return ESP_OK;
}
/**
 * Allocate and fill the int16 twiddle table for the radix-2 int16 FFT.
 *
 * @param fft_point  Table length in int16 values; fft_point / 2 (cos, sin) pairs are generated.
 * @param caps       Capability flags forwarded to heap_caps_aligned_alloc.
 * @return           16-byte aligned table, or NULL when the allocation fails.
 *
 * Pair i holds cos and sin of i * 2 * pi / fft_point scaled by INT16_MAX and rounded.
 * After generation the pairs are reordered with dl_bitrev2r_sc16_ansi.
 */
int16_t *dl_gen_fft_table_sc16(int fft_point, uint32_t caps)
{
    int16_t *table = (int16_t *)heap_caps_aligned_alloc(16, fft_point * sizeof(int16_t), caps);
    if (!table) {
        return table;
    }

    const float step = M_PI * 2.0 / fft_point;
    const int pairs = fft_point >> 1;
    int16_t *out = table;
    for (int i = 0; i < pairs; i++) {
        const float phase = i * step;
        *out++ = (int16_t)roundf(INT16_MAX * cosf(phase));
        *out++ = (int16_t)roundf(INT16_MAX * sinf(phase));
    }
    dl_bitrev2r_sc16_ansi(table, pairs);
    return table;
}
/**
 * Allocate and fill the int16 coefficient table used by the int16 real-FFT pre/post processing.
 *
 * @param fft_point  Table length in int16 values; fft_point / 2 (cos, sin) pairs are written.
 * @param caps       Capability flags forwarded to heap_caps_aligned_alloc.
 * @return           16-byte aligned table, or NULL when the allocation fails.
 *
 * Pair k (1-based) holds cos and sin of k * 2 * pi / fft_point, scaled by INT16_MAX and
 * rounded, at table[2k - 2] and table[2k - 1]. No reordering is applied.
 */
int16_t *dl_gen_rfft_table_s16(int fft_point, uint32_t caps)
{
    int16_t *table = (int16_t *)heap_caps_aligned_alloc(16, fft_point * sizeof(int16_t), caps);
    if (!table) {
        return table;
    }

    const float step = M_PI * 2.0 / fft_point;
    const int pairs = fft_point >> 1;
    for (int k = 1; k <= pairs; k++) {
        const float phase = k * step;
        table[2 * k - 2] = (int16_t)roundf(INT16_MAX * cosf(phase));
        table[2 * k - 1] = (int16_t)roundf(INT16_MAX * sinf(phase));
    }
    return table;
}

View File

@@ -0,0 +1,277 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dl_fft_base.h"
esp_err_t dl_fft4r_fc32_ansi(float *data, int length, float *table, int table_size)
{
dl_fc32_t bfly[4];
int log2N = dl_power_of_two(length);
int log4N = log2N >> 1;
if ((log2N & 0x01) != 0) {
return ESP_FAIL;
}
int m = 2;
int wind_step = 2;
while (1) { /// radix 4
if (log4N == 0) {
break;
}
length = length >> 2;
for (int j = 0; j < m; j += 2) { // j: which FFT of this step
int start_index = j * (length << 1); // n: n-point FFT
dl_fc32_t *ptrc0 = (dl_fc32_t *)data + start_index;
dl_fc32_t *ptrc1 = ptrc0 + length;
dl_fc32_t *ptrc2 = ptrc1 + length;
dl_fc32_t *ptrc3 = ptrc2 + length;
dl_fc32_t *winc0 = (dl_fc32_t *)table;
dl_fc32_t *winc1 = winc0;
dl_fc32_t *winc2 = winc0;
for (int k = 0; k < length; k++) {
dl_fc32_t in0 = *ptrc0;
dl_fc32_t in2 = *ptrc2;
dl_fc32_t in1 = *ptrc1;
dl_fc32_t in3 = *ptrc3;
bfly[0].re = in0.re + in2.re + in1.re + in3.re;
bfly[0].im = in0.im + in2.im + in1.im + in3.im;
bfly[1].re = in0.re - in2.re + in1.im - in3.im;
bfly[1].im = in0.im - in2.im - in1.re + in3.re;
bfly[2].re = in0.re + in2.re - in1.re - in3.re;
bfly[2].im = in0.im + in2.im - in1.im - in3.im;
bfly[3].re = in0.re - in2.re - in1.im + in3.im;
bfly[3].im = in0.im - in2.im + in1.re - in3.re;
*ptrc0 = bfly[0];
ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
winc0 += 1 * wind_step;
winc1 += 2 * wind_step;
winc2 += 3 * wind_step;
ptrc0++;
ptrc1++;
ptrc2++;
ptrc3++;
}
}
m = m << 2;
wind_step = wind_step << 2;
log4N--;
}
return ESP_OK;
}
esp_err_t dl_ifft4r_fc32_ansi(float *data, int length, float *table, int table_size)
{
dl_fc32_t bfly[4];
int log2N = dl_power_of_two(length);
int log4N = log2N >> 1;
if ((log2N & 0x01) != 0) {
return ESP_FAIL;
}
int m = 2;
int wind_step = 2;
while (1) { /// radix 4
if (log4N == 0) {
break;
}
length = length >> 2;
for (int j = 0; j < m; j += 2) { // j: which FFT of this step
int start_index = j * (length << 1); // n: n-point FFT
dl_fc32_t *ptrc0 = (dl_fc32_t *)data + start_index;
dl_fc32_t *ptrc1 = ptrc0 + length;
dl_fc32_t *ptrc2 = ptrc1 + length;
dl_fc32_t *ptrc3 = ptrc2 + length;
dl_fc32_t *winc0 = (dl_fc32_t *)table;
dl_fc32_t *winc1 = winc0;
dl_fc32_t *winc2 = winc0;
for (int k = 0; k < length; k++) {
dl_fc32_t in0 = *ptrc0;
dl_fc32_t in2 = *ptrc2;
dl_fc32_t in1 = *ptrc1;
dl_fc32_t in3 = *ptrc3;
bfly[0].re = in0.re + in2.re + in1.re + in3.re;
bfly[0].im = in0.im + in2.im + in1.im + in3.im;
bfly[1].re = in0.re - in2.re - in1.im + in3.im; // this fft & ifft is different
bfly[1].im = in0.im - in2.im + in1.re - in3.re; // this fft & ifft is different
bfly[2].re = in0.re + in2.re - in1.re - in3.re;
bfly[2].im = in0.im + in2.im - in1.im - in3.im;
bfly[3].re = in0.re - in2.re + in1.im - in3.im; // this fft & ifft is different
bfly[3].im = in0.im - in2.im - in1.re + in3.re; // this fft & ifft is different
*ptrc0 = bfly[0];
ptrc1->re = bfly[1].re * winc0->re - bfly[1].im * winc0->im; // this fft & ifft is different
ptrc1->im = bfly[1].im * winc0->re + bfly[1].re * winc0->im; // this fft & ifft is different
ptrc2->re = bfly[2].re * winc1->re - bfly[2].im * winc1->im; // this fft & ifft is different
ptrc2->im = bfly[2].im * winc1->re + bfly[2].re * winc1->im; // this fft & ifft is different
ptrc3->re = bfly[3].re * winc2->re - bfly[3].im * winc2->im; // this fft & ifft is different
ptrc3->im = bfly[3].im * winc2->re + bfly[3].re * winc2->im; // this fft & ifft is different
winc0 += 1 * wind_step;
winc1 += 2 * wind_step;
winc2 += 3 * wind_step;
ptrc0++;
ptrc1++;
ptrc2++;
ptrc3++;
}
}
m = m << 2;
wind_step = wind_step << 2;
log4N--;
}
return ESP_OK;
}
/**
 * Reorder interleaved complex float data for the radix-4 FFT by swapping pairs in place.
 *
 * @param data          Interleaved (re, im) float pairs.
 * @param N             Number of complex points; only used when bitrev_table is NULL.
 * @param bitrev_table  Optional list of swap pairs. Each entry holds two float offsets into
 *                      data (offset of the real part; the imaginary part is at offset + 1).
 * @param bitrev_size   Number of swap pairs in bitrev_table.
 * @return              ESP_OK, or ESP_FAIL on the table-less path when dl_power_of_two(N)
 *                      is odd.
 *
 * Without a table, each index has its base-4 digits reversed and is swapped with the
 * result when the index is the smaller of the two.
 */
esp_err_t dl_bitrev4r_fc32_ansi(float *data, int N, uint16_t *bitrev_table, int bitrev_size)
{
    if (bitrev_table) {
        for (int n = 0; n < bitrev_size; n++) {
            float *a = data + bitrev_table[n * 2];
            float *b = data + bitrev_table[n * 2 + 1];
            const float re = b[0];
            const float im = b[1];
            b[0] = a[0];
            a[0] = re;
            b[1] = a[1];
            a[1] = im;
        }
        return ESP_OK;
    }

    const int log2N = dl_power_of_two(N);
    if ((log2N & 0x01) != 0) {
        return ESP_FAIL;
    }
    const int digits = log2N >> 1;

    for (int i = 0; i < N; i++) {
        // Reverse the base-4 digits of i.
        int rev = 0;
        for (int d = 0, rest = i; d < digits; d++, rest >>= 2) {
            rev = (rev << 2) + (rest & 0x3);
        }
        if (i >= rev) {
            continue; // swap each pair only once
        }
        float *a = data + i * 2;
        float *b = data + rev * 2;
        const float re = a[0];
        const float im = a[1];
        a[0] = b[0];
        a[1] = b[1];
        b[0] = re;
        b[1] = im;
    }
    return ESP_OK;
}
/* Reverse the lowest `digits` base-4 digits (2-bit groups) of `value`. */
static int dl_bitrev4r_reverse_digits(int value, int digits)
{
    int reversed = 0;
    for (; digits > 0; digits--) {
        reversed = (reversed << 2) + (value & 0x3);
        value >>= 2;
    }
    return reversed;
}

/**
 * Build the swap table used by dl_bitrev4r_fc32_ansi() for an N-point radix-4 FFT.
 *
 * The table holds `*bitrev_size` pairs of uint16_t values. Each value is an offset
 * into the interleaved float array (2 * complex index); each pair names two complex
 * samples that have to be exchanged.
 *
 * @param N            Number of complex points.
 * @param caps         Capability flags forwarded to heap_caps_malloc().
 * @param bitrev_size  Output: number of pairs in the returned table; set to 0
 *                     whenever NULL is returned.
 * @return Newly allocated table owned by the caller, or NULL when the log2 of N
 *         reported by dl_power_of_two() is odd, when the offsets would not fit
 *         in uint16_t, or when the allocation fails.
 */
uint16_t *dl_gen_bitrev4r_table(int N, uint32_t caps, int *bitrev_size)
{
    bitrev_size[0] = 0;

    const int log2N = dl_power_of_two(N);
    if ((log2N & 0x01) != 0) {
        return NULL;
    }
    const int log4N = log2N >> 1;

    /*
     * Entries store 2 * index, so the largest stored value is 2 * (N - 1).
     * Checking the pair count instead (as was done before) let N = 65536
     * through and silently truncated the offsets to 16 bits.
     */
    if (N - 1 > UINT16_MAX / 2) {
        return NULL;
    }

    /* First pass: count the index pairs that actually need swapping. */
    int count = 0;
    for (int i = 0; i < N; i++) {
        if (i < dl_bitrev4r_reverse_digits(i, log4N)) {
            count++;
        }
    }

    uint16_t *bitrev_table = (uint16_t *)heap_caps_malloc(2 * count * sizeof(uint16_t), caps);
    if (!bitrev_table) {
        return NULL;
    }

    /* Second pass: record each pair once, lower index first. */
    int idx = 0;
    for (int i = 0; i < N; i++) {
        const int xx = dl_bitrev4r_reverse_digits(i, log4N);
        if (i < xx) {
            bitrev_table[idx * 2] = (uint16_t)(i * 2);
            bitrev_table[idx * 2 + 1] = (uint16_t)(xx * 2);
            idx++;
        }
    }

    bitrev_size[0] = count;
    return bitrev_table;
}
/**
 * Allocate and fill the twiddle table for the radix-4 float FFT.
 *
 * The table holds `fft_point` interleaved (cos, sin) pairs for the angles
 * 2 * pi * i / fft_point, i = 0 .. fft_point - 1, and is 16-byte aligned.
 *
 * @param fft_point  Number of (cos, sin) pairs to generate.
 * @param caps       Capability flags forwarded to heap_caps_aligned_alloc().
 * @return The new table (owned by the caller), or NULL if the allocation fails.
 */
float *dl_gen_fft4r_table_f32(int fft_point, uint32_t caps)
{
    float *fft_table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float) * 2, caps);
    if (!fft_table) {
        return fft_table;
    }

    const double full_turn = 2 * M_PI;
    float *entry = fft_table;
    for (int i = 0; i < fft_point; i++) {
        /* The angle is evaluated in double precision, then narrowed to float. */
        const float angle = full_turn * i / fft_point;
        *entry++ = cosf(angle);
        *entry++ = sinf(angle);
    }
    return fft_table;
}

View File

@@ -0,0 +1,92 @@
#include "dl_fft_base.h"
/**
 * Tell whether `x` is a positive power of two.
 *
 * Zero and negative values are rejected up front. This also keeps `x - 1`
 * from overflowing for INT_MIN, which has a single bit set and would
 * otherwise be reported as a power of two.
 *
 * @param x  Value to test.
 * @return true when x is 1, 2, 4, 8, ...; false otherwise.
 */
bool dl_is_power_of_two(int x)
{
    return (x > 0) && ((x & (x - 1)) == 0);
}
/**
 * Return the position of the highest set bit of `n`, i.e. floor(log2(n)).
 *
 * Works as a binary search over the bit width: at each step the upper part
 * of the remaining value is tested and, if non-zero, kept.
 *
 * @param n  Value to inspect.
 * @return floor(log2(n)) for n >= 1; 0 for n == 0.
 */
int dl_power_of_two(uint32_t n)
{
    int pos = 0;
    for (int shift = 16; shift >= 1; shift >>= 1) {
        if ((n >> shift) != 0) {
            n >>= shift;
            pos += shift;
        }
    }
    return pos;
}
float *dl_short_to_float(const int16_t *x, int len, int exponent, float *y)
{
float scale = powf(2, exponent);
// printf("scale: %f\n", scale);
for (int i = 0; i < len; i++) {
y[i] = scale * x[i];
}
return y;
}
/**
 * Measure how many bits the largest magnitude in `x` occupies.
 *
 * The result is 2 + floor(log2(m)), where m is the largest absolute value in
 * the array, or 1 when every element is zero (or the array is empty).
 *
 * Fixes over the previous version:
 *  - the scan now starts at index 0, so x[0] is no longer ignored;
 *  - magnitudes are accumulated in `int`, so negating INT16_MIN cannot
 *    overflow. That one magnitude (32768) is saturated to INT16_MAX so the
 *    result stays in the range 1..16.
 *
 * @param x     Input samples.
 * @param size  Number of samples.
 * @return Bit-count figure as described above.
 */
int16_t dl_array_max_q_s16(const int16_t *x, int size)
{
    int peak = 0;
    for (int i = 0; i < size; i++) {
        const int magnitude = (x[i] < 0) ? -(int)x[i] : (int)x[i];
        if (magnitude > peak) {
            peak = magnitude;
        }
    }
    if (peak == 0) {
        return 1;
    }
    if (peak > INT16_MAX) {
        peak = INT16_MAX;
    }
    int16_t k = 2;
    while (peak > 1) {
        k++;
        peak >>= 1;
    }
    return k;
}
/**
 * Return floor(log2(ceil(m + eps))), where m is the largest absolute value in `x`.
 *
 * The scan now starts at index 0; previously x[0] was skipped, so an array
 * whose largest magnitude sat in the first element was under-estimated.
 *
 * @param x     Input samples.
 * @param size  Number of samples.
 * @param eps   Small offset added to the maximum before rounding up.
 * @return dl_power_of_two() of the rounded-up maximum magnitude.
 */
int dl_array_max_q_f32(const float *x, int size, float eps)
{
    float peak = 0;
    for (int i = 0; i < size; i++) {
        const float magnitude = (x[i] < 0) ? -x[i] : x[i];
        if (magnitude > peak) {
            peak = magnitude;
        }
    }
    int max_int = ceilf(peak + eps);
    return dl_power_of_two(max_int);
}
/**
 * Quantize float samples to int16 with a power-of-two scale.
 *
 * The scale is 2^(out_exponent - q), where q is the value that
 * dl_array_max_q_f32() reports for the input. Each sample is multiplied by
 * the scale and rounded to nearest. Results outside the int16 range are
 * saturated; converting an out-of-range float to int16_t directly is
 * undefined behaviour in C.
 *
 * @param x             Input samples.
 * @param len           Number of samples.
 * @param y             Output buffer with room for `len` int16 values.
 * @param out_exponent  Exponent from which the scale is derived.
 * @return The negated exponent of the scale that was applied.
 */
int dl_float_to_short(const float *x, int len, int16_t *y, int out_exponent)
{
    int exponent = out_exponent - dl_array_max_q_f32(x, len, 1e-8);
    float scale = powf(2, exponent);
    for (int i = 0; i < len; i++) {
        float scaled = roundf(x[i] * scale);
        if (scaled > (float)INT16_MAX) {
            scaled = (float)INT16_MAX;
        } else if (scaled < (float)INT16_MIN) {
            scaled = (float)INT16_MIN;
        }
        y[i] = (int16_t)scaled;
    }
    return -exponent;
}

View File

@@ -0,0 +1,88 @@
#pragma once
#include "dl_fft_dtype.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_heap_caps.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "dl_fft_platform.h"
// Common helpers shared by the FFT implementations.
// Power-of-two test on x; 0 is never reported as a power of two.
bool dl_is_power_of_two(int x);
// Position of the highest set bit of n, i.e. floor(log2(n)); returns 0 for n == 0 and n == 1.
int dl_power_of_two(uint32_t n);
// Writes y[i] = x[i] * 2^exponent for i in [0, len) and returns y.
float *dl_short_to_float(const int16_t *x, int len, int exponent, float *y);
// Returns 2 + floor(log2(m)), where m is the largest magnitude it finds in x; returns 1 when m is 0.
int16_t dl_array_max_q_s16(const int16_t *x, int size);
// Quantizes x into y with a power-of-two scale derived from out_exponent and the
// magnitude of x; returns the negated exponent of the scale that was applied.
int dl_float_to_short(const float *x, int len, int16_t *y, int out_exponent);
// Radix-2 complex FFT on float data: table generators, forward/inverse transform
// and bit-reversal reordering (ANSI C implementations).
float *dl_gen_fftr2_table_f32(int fft_point, uint32_t caps);
uint16_t *dl_gen_bitrev2r_table(int N, uint32_t caps, int *bitrev_size);
esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w);
esp_err_t dl_ifft2r_fc32_ansi(float *data, int N, float *w);
esp_err_t dl_bitrev2r_fc32_ansi(float *data, int N, uint16_t *reverse_tab, int bitrev_size);
// Radix-4 complex FFT on float data, plus the real-FFT pre/post processing steps.
float *dl_gen_rfft_table_f32(int fft_point, uint32_t caps);
// Allocates a 16-byte aligned table of fft_point (cos, sin) pairs for the angles
// 2 * pi * i / fft_point; returns NULL when the allocation fails.
float *dl_gen_fft4r_table_f32(int fft_point, uint32_t caps);
// Builds the swap-pair table consumed by dl_bitrev4r_fc32_ansi and stores the number
// of pairs in *bitrev_size. May return NULL; dl_bitrev4r_fc32_ansi accepts a NULL
// table and then computes the permutation itself.
uint16_t *dl_gen_bitrev4r_table(int N, uint32_t caps, int *bitrev_size);
esp_err_t dl_fft4r_fc32_ansi(float *data, int length, float *table, int table_size);
esp_err_t dl_ifft4r_fc32_ansi(float *data, int length, float *table, int table_size);
// Reorders interleaved complex data into radix-4 digit-reversed order, in place.
// Returns ESP_FAIL when reverse_tab is NULL and N is not a power of four.
esp_err_t dl_bitrev4r_fc32_ansi(float *data, int N, uint16_t *reverse_tab, int bitrev_size);
esp_err_t dl_rfft_post_proc_fc32_ansi(float *data, int N, float *table);
esp_err_t dl_rfft_pre_proc_fc32_ansi(float *data, int N, float *table);
// int16 fft and rfft
// The *_hp variants take an extra `shift` in/out argument.
int16_t *dl_gen_fft_table_sc16(int fft_point, uint32_t caps);
int16_t *dl_gen_rfft_table_s16(int fft_point, uint32_t caps);
esp_err_t dl_fft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);
esp_err_t dl_fft2r_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_ifft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);
esp_err_t dl_ifft2r_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_bitrev2r_sc16_ansi(int16_t *data, int N);
esp_err_t dl_rfft_post_proc_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_rfft_pre_proc_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_cplx2real_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);
// Dispatch of the float FFT entry points: each supported target maps the generic
// names onto its assembly routine from dl_fft_platform.h; every other target falls
// back to the ANSI C implementation.
// NOTE(review): the assembly routines are declared as returning void while the ANSI
// ones return esp_err_t, so the result of dl_fft*_fc32() cannot be used portably.
#if CONFIG_IDF_TARGET_ESP32
#define dl_fft2r_fc32 dl_fft2r_fc32_ae32_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_ae32_
#define dl_fft4r_fc32 dl_fft4r_fc32_ae32_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_ae32_
#elif CONFIG_IDF_TARGET_ESP32S3
#define dl_fft2r_fc32 dl_fft2r_fc32_aes3_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_aes3_
#define dl_fft4r_fc32 dl_fft4r_fc32_aes3_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_aes3_
#elif CONFIG_IDF_TARGET_ESP32P4
#define dl_fft2r_fc32 dl_fft2r_fc32_arp4_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_arp4_
#define dl_fft4r_fc32 dl_fft4r_fc32_arp4_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_arp4_
#else
#define dl_fft2r_fc32 dl_fft2r_fc32_ansi
#define dl_ifft2r_fc32 dl_ifft2r_fc32_ansi
#define dl_fft4r_fc32 dl_fft4r_fc32_ansi
#define dl_ifft4r_fc32 dl_ifft4r_fc32_ansi
#endif
// The int16 transforms always use the ANSI C implementation, on every target.
#define dl_fft2r_sc16 dl_fft2r_sc16_ansi
#define dl_fft2r_sc16_hp dl_fft2r_sc16_hp_ansi
#define dl_ifft2r_sc16 dl_ifft2r_sc16_ansi
#define dl_ifft2r_sc16_hp dl_ifft2r_sc16_hp_ansi
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,30 @@
#pragma once
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// Complex number with 16-bit signed real and imaginary parts.
// The union lets the pair be accessed either field by field (re, im) or as one
// 32-bit word (data), e.g. to copy a whole sample with a single load/store.
// Which half of `data` holds `re` depends on the target's byte order.
typedef union dl_sc16_u {
struct {
int16_t re; // real part
int16_t im; // imaginary part
};
uint32_t data; // both parts viewed as one 32-bit word
} dl_sc16_t;
// Complex number with 32-bit float real and imaginary parts.
// `data` overlays both floats as one 64-bit word so a whole sample can be moved
// at once; it is a raw bit view, not a numeric value.
typedef union dl_fc32_u {
struct {
float re; // real part
float im; // imaginary part
};
uint64_t data; // both parts viewed as one 64-bit word
} dl_fc32_t;
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,36 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
// Prototypes of the target-specific assembly FFT routines. Exactly one group is
// declared, selected by the IDF target; on any other target none is declared.
// All routines work in place on interleaved (re, im) float data and return nothing.
// ESP32 (suffix _ae32_).
#if CONFIG_IDF_TARGET_ESP32
void dl_fft2r_fc32_ae32_(float *data, int N, float *table);
void dl_ifft2r_fc32_ae32_(float *data, int N, float *table);
void dl_fft4r_fc32_ae32_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_ae32_(float *data, int N, float *table, int table_size);
// ESP32-S3 (suffix _aes3_).
#elif CONFIG_IDF_TARGET_ESP32S3
void dl_fft2r_fc32_aes3_(float *data, int N, float *table);
void dl_ifft2r_fc32_aes3_(float *data, int N, float *table);
void dl_fft4r_fc32_aes3_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_aes3_(float *data, int N, float *table, int table_size);
// Disabled prototypes of int16 test routines, kept for reference:
// void test_radix2_fft_bf_s16(int16_t *data, int16_t *table, int16_t fft_point, int16_t log2n, int16_t);
// int test_radix2_fft_bf_s16_hp(int16_t *, int16_t *, int16_t, int16_t, int16_t);
// void test_radix2_bit_reverse(int16_t *data, int16_t cpx_point, int16_t log2n);
// void test_fftr_s16(int16_t *, int16_t *, int16_t);
// void test_ffti_s16(int16_t *, int16_t *, int16_t);
// void test_radix2_ifft_bf_s16(int16_t *, int16_t *, int16_t, int16_t, int16_t);
// int test_radix2_ifft_bf_s16_hp(int16_t *, int16_t *, int16_t, int16_t, int16_t);
// ESP32-P4 (suffix _arp4_).
#elif CONFIG_IDF_TARGET_ESP32P4
void dl_fft2r_fc32_arp4_(float *data, int N, float *table);
void dl_ifft2r_fc32_arp4_(float *data, int N, float *table);
void dl_fft4r_fc32_arp4_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_arp4_(float *data, int N, float *table, int table_size);
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,236 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
.text
.align 4
.global dl_fft2r_fc32_ae32_
.type dl_fft2r_fc32_ae32_,@function
// Forward radix-2 complex FFT butterflies on interleaved float data, in place
// (Xtensa with single-precision FPU). w holds (cos, sin) pairs.
// The function implements the following C code:
//esp_err_t dl_fft2r_fc32_ansi(float *data, int N)
//{
// float *w = dl_fft_w_table_fc32;
//
// int ie, ia, m;
// float re_temp, im_temp;
// float c, s;
// int N2 = N;
// ie = 1;
// for (int N2 = N/2; N2 > 0; N2 >>= 1) {
// ia = 0;
// for (int j = 0; j < ie; j++) {
// c = w[2 * j];
// s = w[2 * j + 1];
// for (int i = 0; i < N2; i++) {
// m = ia + N2;
// re_temp = c * data[2 * m] + s * data[2 * m + 1];
// im_temp = c * data[2 * m + 1] - s * data[2 * m];
// data[2 * m] = data[2 * ia] - re_temp;
// data[2 * m + 1] = data[2 * ia + 1] - im_temp;
// data[2 * ia] = data[2 * ia] + re_temp;
// data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
// ia++;
// }
// ia += N2;
// }
// ie <<= 1;
// }
// return result;
//}
dl_fft2r_fc32_ae32_:
// Unlike the C reference, the twiddle table is passed as third argument and no
// status is returned (a2 is never written):
// dl_fft2r_fc32_ae32_(float *data, int N, float* dl_fft_w_table_fc32)
entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - k, main loop counter; N2 - for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - pointer to w[2 * j], i.e. w + (j << 3) bytes
// f0 - c or w[2 * j]
// f1 - s or w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2 * ia])
// a14 - m pointer (&data[2 * m])
// f6 - re_temp
// f7 - im_temp
srli a6, a3, 1 // a6 = N2 = N/2
movi.n a7, 1 // a7 - ie
.fft2r_l1:
movi.n a8, 0 // a8 - j
movi.n a11,0 // a11 = ia = 0;
.fft2r_l2: // loop for j, a8 - j
addx8 a10, a8, a4 // a10 = w + j * 8 bytes -- pointer to c = w[2 * j]
lsi f0, a10, 0 // f0 = c = w[2 * j]
lsi f1, a10, 4 // f1 = s = w[2 * j + 1]
loopnez a6, .fft2r_l3 // zero-overhead loop: body runs N2 times, skipped when N2 == 0
add.n a12, a11, a6 // a12 = m = ia + N2
addx8 a14, a12, a2 // a14 - pointer for m*2
addx8 a13, a11, a2 // a13 - pointer for ia*2
lsi f4, a14, 0 // data[2 * m]
mul.s f6, f0, f4 // re_temp = c * data[2 * m]
lsi f5, a14, 4 // data[2 * m + 1]
mul.s f7, f0, f5 // im_temp = c * data[2 * m + 1]
lsi f2, a13, 0 // data[2 * ia]
madd.s f6, f1, f5 // re_temp += s * data[2 * m + 1];
lsi f3, a13, 4 // data[2 * ia + 1]
msub.s f7, f1, f4 // im_temp -= s * data[2 * m];
addi a11, a11, 1 // ia++
sub.s f8, f2, f6 // = data[2 * ia] - re_temp;
add.s f10, f2, f6 // = data[2 * ia] + re_temp;
sub.s f9, f3, f7 // = data[2 * ia + 1] - im_temp;
add.s f11, f3, f7 // = data[2 * ia + 1] + im_temp;
ssi f8, a14, 0 // data[2 * m] = f8
ssi f10, a13, 0 // data[2 * ia] = f10
ssi f9, a14, 4 // data[2 * m + 1] = f9
ssi f11, a13, 4 // data[2 * ia + 1] = f11
.fft2r_l3:
add.n a11, a11, a6 // ia += N2
addi.n a8, a8, 1 // j++
bne a8, a7, .fft2r_l2 // repeat while j != ie
slli a7, a7, 1 // ie = ie<<1
// main loop: for (int k = N/2; k > 0; k >>= 1)
srli a6, a6, 1 // a6 = a6>>1
bnez a6, .fft2r_l1 // Jump if > 0
// movi.n a2, 0 // return status ESP_OK
retw
.text
.align 4
.global dl_ifft2r_fc32_ae32_
.type dl_ifft2r_fc32_ae32_,@function
// Inverse radix-2 complex FFT butterflies on interleaved float data, in place.
// Identical to the forward routine except that the sine of each twiddle is negated.
// The function implements the following C code:
//esp_err_t dl_ifft2r_fc32_ansi(float *data, int N)
//{
// float *w = dl_fft_w_table_fc32;
//
// int ie, ia, m;
// float re_temp, im_temp;
// float c, s;
// int N2 = N;
// ie = 1;
// for (int N2 = N/2; N2 > 0; N2 >>= 1) {
// ia = 0;
// for (int j = 0; j < ie; j++) {
// c = w[2 * j];
// s = -w[2 * j + 1];
// for (int i = 0; i < N2; i++) {
// m = ia + N2;
// re_temp = c * data[2 * m] + s * data[2 * m + 1];
// im_temp = c * data[2 * m + 1] - s * data[2 * m];
// data[2 * m] = data[2 * ia] - re_temp;
// data[2 * m + 1] = data[2 * ia + 1] - im_temp;
// data[2 * ia] = data[2 * ia] + re_temp;
// data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
// ia++;
// }
// ia += N2;
// }
// ie <<= 1;
// }
// return result;
//}
dl_ifft2r_fc32_ae32_:
// Unlike the C reference, the twiddle table is passed as third argument and no
// status is returned (a2 is never written):
// dl_ifft2r_fc32_ae32_(float *data, int N, float* dl_fft_w_table_fc32)
entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - k, main loop counter; N2 - for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - pointer to w[2 * j], i.e. w + (j << 3) bytes
// f0 - c or w[2 * j]
// f1 - s or -w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2 * ia])
// a14 - m pointer (&data[2 * m])
// f6 - re_temp
// f7 - im_temp
srli a6, a3, 1 // a6 = N2 = N/2
movi.n a7, 1 // a7 - ie
.ifft2r_l1:
movi.n a8, 0 // a8 - j
movi.n a11,0 // a11 = ia = 0;
.ifft2r_l2: // loop for j, a8 - j
addx8 a10, a8, a4 // a10 = w + j * 8 bytes -- pointer to c = w[2 * j]
lsi f0, a10, 0 // f0 = c = w[2 * j]
lsi f1, a10, 4 // f1 = w[2 * j + 1]
// Only difference to the forward FFT: negate the imaginary part of the twiddle factor
neg.s f1, f1 // f1 = s = -w[2 * j + 1]
loopnez a6, .ifft2r_l3 // zero-overhead loop: body runs N2 times, skipped when N2 == 0
add.n a12, a11, a6 // a12 = m = ia + N2
addx8 a14, a12, a2 // a14 - pointer for m*2
addx8 a13, a11, a2 // a13 - pointer for ia*2
lsi f4, a14, 0 // data[2 * m]
mul.s f6, f0, f4 // re_temp = c * data[2 * m]
lsi f5, a14, 4 // data[2 * m + 1]
mul.s f7, f0, f5 // im_temp = c * data[2 * m + 1]
lsi f2, a13, 0 // data[2 * ia]
madd.s f6, f1, f5 // re_temp += s * data[2 * m + 1];
lsi f3, a13, 4 // data[2 * ia + 1]
msub.s f7, f1, f4 // im_temp -= s * data[2 * m];
addi a11, a11, 1 // ia++
sub.s f8, f2, f6 // = data[2 * ia] - re_temp;
add.s f10, f2, f6 // = data[2 * ia] + re_temp;
sub.s f9, f3, f7 // = data[2 * ia + 1] - im_temp;
add.s f11, f3, f7 // = data[2 * ia + 1] + im_temp;
ssi f8, a14, 0 // data[2 * m] = f8
ssi f10, a13, 0 // data[2 * ia] = f10
ssi f9, a14, 4 // data[2 * m + 1] = f9
ssi f11, a13, 4 // data[2 * ia + 1] = f11
.ifft2r_l3:
add.n a11, a11, a6 // ia += N2
addi.n a8, a8, 1 // j++
bne a8, a7, .ifft2r_l2 // repeat while j != ie
slli a7, a7, 1 // ie = ie<<1
// main loop: for (int k = N/2; k > 0; k >>= 1)
srli a6, a6, 1 // a6 = a6>>1
bnez a6, .ifft2r_l1 // Jump if > 0
// movi.n a2, 0 // return status ESP_OK
retw

View File

@@ -0,0 +1,332 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
.section .text # NOTE(review): original comment claimed IRAM placement, but this is the plain .text section - confirm against the linker script
.global dl_fft4r_fc32_ae32_
.type dl_fft4r_fc32_ae32_,@function
// Forward radix-4 complex FFT butterflies on interleaved float data, in place
// (Xtensa with single-precision FPU).
// The function implements the following C code:
// esp_err_t dl_fft4r_fc32_ansi_(float *data, int length, float *table, int table_size)
// {
// if (0 == dl_fft4r_initialized) {
// return ESP_ERR_DSP_UNINITIALIZED;
// }
//
// uint log2N = dl_power_of_two(length);
// if ((log2N & 0x01) != 0) {
// return ESP_ERR_DSP_INVALID_LENGTH;
// }
// uint log4N = log2N >> 1;
//
// fc32_t bfly[4];
// uint m = 2;
// uint wind_step = table_size / length;
// while (1) { ///radix 4
// if (log4N == 0) {
// break;
// }
// length = length >> 2;
// for (int j = 0; j < m; j += 2) { // j: which FFT of this step
// int start_index = j * (length << 1); // n: n-point FFT
//
// fc32_t *ptrc0 = (fc32_t *)data + start_index;
// fc32_t *ptrc1 = ptrc0 + length;
// fc32_t *ptrc2 = ptrc1 + length;
// fc32_t *ptrc3 = ptrc2 + length;
//
// fc32_t *winc0 = (fc32_t *)table;
// fc32_t *winc1 = winc0;
// fc32_t *winc2 = winc0;
//
// for (int k = 0; k < length; k++) {
// fc32_t in0 = *ptrc0;
// fc32_t in2 = *ptrc2;
// fc32_t in1 = *ptrc1;
// fc32_t in3 = *ptrc3;
//
// bfly[0].re = in0.re + in2.re + in1.re + in3.re;
// bfly[0].im = in0.im + in2.im + in1.im + in3.im;
//
// bfly[1].re = in0.re - in2.re + in1.im - in3.im;
// bfly[1].im = in0.im - in2.im - in1.re + in3.re;
//
// bfly[2].re = in0.re + in2.re - in1.re - in3.re;
// bfly[2].im = in0.im + in2.im - in1.im - in3.im;
//
// bfly[3].re = in0.re - in2.re - in1.im + in3.im;
// bfly[3].im = in0.im - in2.im + in1.re - in3.re;
//
// *ptrc0 = bfly[0];
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
//
// winc0 += 1 * wind_step;
// winc1 += 2 * wind_step;
// winc2 += 3 * wind_step;
//
// ptrc0++;
// ptrc1++;
// ptrc2++;
// ptrc3++;
// }
// }
// m = m << 2;
// wind_step = wind_step << 2;
// log4N--;
// }
// return ESP_OK;
// }
// The assembly version performs none of the argument checks above (they are
// commented out below) and returns no status.
// Arguments: a2 = data, a3 = N, a4 = twiddle table, a5 = table_size
// esp_err_t dl_fft4r_fc32_ae32_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
//.ret_DSP_INVALID_LENGTH:
// movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
// retw.n
.align 4
dl_fft4r_fc32_ae32_:
entry a1, 16 # no auto vars on stack
// bltui a3, 4, .ret_DSP_INVALID_LENGTH # if N < 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
// addi.n a6, a3, -1
// and a6, a3, a6
// bnez a6, .ret_DSP_INVALID_LENGTH # if N not power of 2 : return(ESP_ERR_DSP_INVALID_LENGTH)
nsau a6, a3 # inline dl_power_of_two(N): a6 = number of leading zero bits of N
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - leading zeros = floor(log2(N))
// bbsi a6, 0, .ret_DSP_INVALID_LENGTH # if N not power of 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1), i.e. 2 * wind_step: the twiddle stride counted in floats
movi.n a5, 2 # m = 2
.stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # a12 = j * N (16-bit unsigned multiply: only the low 16 bits of each operand are used)
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .bf4_loop_end # for (uint k = 0; k < N; k++)
lsi f1, a12, 4 # f1 = in0.im = *(p0 + 1)
lsi f3, a14, 4 # f3 = in2.im = *(p2 + 1)
lsi f0, a12, 0 # f0 = in0.re = *p0
lsi f2, a14, 0 # f2 = in2.re = *p2
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
lsi f1, a13, 4 # f1 = in1.im = *(p1 + 1)
lsi f3, a15, 4 # f3 = in3.im = *(p3 + 1)
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f0, a13, 0 # f0 = in1.re = *p1
lsi f2, a15, 0 # f2 = in3.re = *p3
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
add.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re + in1.im - in3.im
sub.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re - in1.im + in3.im
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
sub.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im - in1.re + in3.re
add.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im + in1.re - in3.re
ssi f5, a12, 4 # *(p0 + 1) = f5 = bf0.im
ssip f4, a12, 8 # *p0 = f4 = bf0.re , p0 += 2
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
addx4 a9, a6, a9 # w0 += 4 * w_step bytes = 1 * wind_step complex entries
addx8 a10, a6, a10 # w1 += 8 * w_step bytes = 2 * wind_step complex entries
addx4 a11, a6, a11
addx8 a11, a6, a11 # w2 += 12 * w_step bytes = 3 * wind_step complex entries
msub.s f5, f2, f12 # f5 = bf1.im * w0->re - bf1.re * w0->im
madd.s f4, f3, f12 # f4 = bf1.re * w0->re + bf1.im * w0->im
msub.s f9, f0, f13 # f9 = bf2.im * w1->re - bf2.re * w1->im
madd.s f8, f1, f13 # f8 = bf2.re * w1->re + bf2.im * w1->im
msub.s f11, f6, f14 # f11 = bf3.im * w2->re - bf3.re * w2->im
madd.s f10, f7, f14 # f10 = bf3.re * w2->re + bf3.im * w2->im
ssi f5, a13, 4 # *(p1 + 1) = f5
ssip f4, a13, 8 # *p1 = f4, p1 += 2
ssi f9, a14, 4 # *(p2 + 1) = f9
ssip f8, a14, 8 # *p2 = f8, p2 += 2
ssi f11, a15, 4 # *(p3 + 1) = f11
ssip f10, a15, 8 # *p3 = f10, p3 += 2
.bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .stage_next # if j >= m
j .group
.stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .stage # if log4N > 0
// movi.n a2, DSP_OK # return(DSP_OK)
retw
.section .text # NOTE(review): original comment claimed IRAM placement, but this is the plain .text section - confirm against the linker script
.global dl_ifft4r_fc32_ae32_
.type dl_ifft4r_fc32_ae32_,@function
// Inverse radix-4 complex FFT butterflies on interleaved float data, in place.
// Same structure as the forward routine; the +/-j rotations inside the butterfly
// are swapped and the outputs are multiplied by w instead of conj(w).
// No argument checks are performed (they are commented out below) and no status
// is returned.
// Arguments: a2 = data, a3 = N, a4 = twiddle table, a5 = table_size
// esp_err_t dl_ifft4r_fc32_ae32_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
//.ret_DSP_INVALID_LENGTH:
// movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
// retw.n
.align 4
dl_ifft4r_fc32_ae32_:
entry a1, 16 # no auto vars on stack
// bltui a3, 4, .ret_DSP_INVALID_LENGTH # if N < 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
// addi.n a6, a3, -1
// and a6, a3, a6
// bnez a6, .ret_DSP_INVALID_LENGTH # if N not power of 2 : return(ESP_ERR_DSP_INVALID_LENGTH)
nsau a6, a3 # inline dl_power_of_two(N): a6 = number of leading zero bits of N
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - leading zeros = floor(log2(N))
// bbsi a6, 0, .ret_DSP_INVALID_LENGTH # if N not power of 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1), i.e. 2 * wind_step: the twiddle stride counted in floats
movi.n a5, 2 # m = 2
.ifft_stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.ifft_group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # a12 = j * N (16-bit unsigned multiply: only the low 16 bits of each operand are used)
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .inv_bf4_loop_end # for (uint k = 0; k < N; k++)
lsi f1, a12, 4 # f1 = in0.im = *(p0 + 1)
lsi f3, a14, 4 # f3 = in2.im = *(p2 + 1)
lsi f0, a12, 0 # f0 = in0.re = *p0
lsi f2, a14, 0 # f2 = in2.re = *p2
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
lsi f1, a13, 4 # f1 = in1.im = *(p1 + 1)
lsi f3, a15, 4 # f3 = in3.im = *(p3 + 1)
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f0, a13, 0 # f0 = in1.re = *p1
lsi f2, a15, 0 # f2 = in3.re = *p3
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
sub.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re - in1.im + in3.im
add.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re + in1.im - in3.im
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
add.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im + in1.re - in3.re
sub.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im - in1.re + in3.re
ssi f5, a12, 4 # *(p0 + 1) = f5 = bf0.im
ssip f4, a12, 8 # *p0 = f4 = bf0.re , p0 += 2
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
addx4 a9, a6, a9 # w0 += 4 * w_step bytes = 1 * wind_step complex entries
addx8 a10, a6, a10 # w1 += 8 * w_step bytes = 2 * wind_step complex entries
addx4 a11, a6, a11
addx8 a11, a6, a11 # w2 += 12 * w_step bytes = 3 * wind_step complex entries
madd.s f5, f2, f12 # f5 = bf1.im * w0->re + bf1.re * w0->im
msub.s f4, f3, f12 # f4 = bf1.re * w0->re - bf1.im * w0->im
madd.s f9, f0, f13 # f9 = bf2.im * w1->re + bf2.re * w1->im
msub.s f8, f1, f13 # f8 = bf2.re * w1->re - bf2.im * w1->im
madd.s f11, f6, f14 # f11 = bf3.im * w2->re + bf3.re * w2->im
msub.s f10, f7, f14 # f10 = bf3.re * w2->re - bf3.im * w2->im
ssi f5, a13, 4 # *(p1 + 1) = f5
ssip f4, a13, 8 # *p1 = f4, p1 += 2
ssi f9, a14, 4 # *(p2 + 1) = f9
ssip f8, a14, 8 # *p2 = f8, p2 += 2
ssi f11, a15, 4 # *(p3 + 1) = f11
ssip f10, a15, 8 # *p3 = f10, p3 += 2
.inv_bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .ifft_stage_next # if j >= m
j .ifft_group
.ifft_stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .ifft_stage # if log4N > 0
// movi.n a2, DSP_OK # return(DSP_OK)
retw

View File

@@ -0,0 +1,153 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Forward radix-2 complex float FFT butterflies for the esp32p4 processor (RISC-V),
// working in place on interleaved (re, im) data.
.text
.align 4
.global dl_fft2r_fc32_arp4_
.type dl_fft2r_fc32_arp4_,@function
dl_fft2r_fc32_arp4_:
//esp_err_t dl_fft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
// a0 = data, a1 = N, a2 = twiddle table of (cos, sin) pairs
// t6 = N2, t0 = ie, t1 = j, t4 = ia, t5 = m, a3 = &data[2 * ia], a4 = &data[2 * m]
add sp,sp,-16
#
srli t6, a1, 1 // t6 = N2 = N/2
li t0, 1 // t0 - ie
.fft2r_l1:
li t1, 0 // t1 - j
li t4, 0 // t4 = ia = 0;
.fft2r_l2: // loop for j, t1 - j
slli t3, t1, 3 // t3 = j<<3 // byte offset of c = w[2 * j];
add t3, t3, a2 // t3 - pointer to cos
flw fa0, 0(t3) // fa0 = c = w[2 * j]
flw fa1, 4(t3) // fa1 = s = w[2 * j + 1]
esp.lp.setup 0, t6, .fft2r_l3 // hardware loop, t6 iterations; .fft2r_l3 - label to the last executed instruction
add t5, t4, t6 // t5 = m = ia + N2
slli a4, t5, 3 // a4 - byte offset for m*2
slli a3, t4, 3 // a3 - byte offset for ia*2
add a4, a4, a0 // pointers to data arrays
add a3, a3, a0 //
flw fa4, 0(a4) // data[2 * m]
flw fa5, 4(a4) // data[2 * m + 1]
flw fa2, 0(a3) // data[2 * ia]
flw fa3, 4(a3) // data[2 * ia + 1]
fmul.s ft6, fa0, fa4 // re_temp = c * data[2 * m]
fmul.s ft7, fa0, fa5 // im_temp = c * data[2 * m + 1]
fmadd.s ft6, fa1, fa5, ft6 // re_temp += s * data[2 * m + 1];
fnmsub.s ft7, fa1, fa4, ft7 // im_temp -= s * data[2 * m];
fsub.s ft8, fa2, ft6 // = data[2 * ia] - re_temp;
fsub.s ft9, fa3, ft7 // = data[2 * ia + 1] - im_temp;
fadd.s ft10, fa2, ft6 // = data[2 * ia] + re_temp;
fadd.s ft11, fa3, ft7 // = data[2 * ia + 1] + im_temp;
fsw ft8, 0(a4) // data[2 * m]
fsw ft9, 4(a4) // data[2 * m + 1]
fsw ft10, 0(a3) // data[2 * ia]
fsw ft11, 4(a3) // data[2 * ia + 1]
.fft2r_l3: add t4, t4, 1 // ia++
add t4, t4, t6 // ia += N2
add t1, t1, 1 // j++
BNE t1, t0, .fft2r_l2
slli t0, t0, 1 // ie = ie<<1
srli t6, t6, 1 // t6 = N2 >> 1
BNEZ t6, .fft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0 // always returns 0
ret
// Inverse radix-2 complex float FFT butterflies for the esp32p4 processor (RISC-V),
// working in place on interleaved (re, im) data. Identical to the forward routine
// except that the sine of each twiddle is negated.
.text
.align 4
.global dl_ifft2r_fc32_arp4_
.type dl_ifft2r_fc32_arp4_,@function
dl_ifft2r_fc32_arp4_:
//esp_err_t dl_ifft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
// a0 = data, a1 = N, a2 = twiddle table of (cos, sin) pairs
// t6 = N2, t0 = ie, t1 = j, t4 = ia, t5 = m, a3 = &data[2 * ia], a4 = &data[2 * m]
add sp,sp,-16
#
srli t6, a1, 1 // t6 = N2 = N/2
li t0, 1 // t0 - ie
.ifft2r_l1:
li t1, 0 // t1 - j
li t4, 0 // t4 = ia = 0;
.ifft2r_l2: // loop for j, t1 - j
slli t3, t1, 3 // t3 = j<<3 // byte offset of c = w[2 * j];
add t3, t3, a2 // t3 - pointer to cos
flw fa0, 0(t3) // fa0 = c = w[2 * j]
flw fa1, 4(t3) // fa1 = w[2 * j + 1]
// Only difference to the forward FFT: negate the imaginary part of the twiddle factor (complex conjugate)
fneg.s fa1, fa1 // s = -s (since w^-1 = w*)
esp.lp.setup 0, t6, .ifft2r_l3 // hardware loop, t6 iterations; .ifft2r_l3 - label to the last executed instruction
add t5, t4, t6 // t5 = m = ia + N2
slli a4, t5, 3 // a4 - byte offset for m*2
slli a3, t4, 3 // a3 - byte offset for ia*2
add a4, a4, a0 // pointers to data arrays
add a3, a3, a0 //
flw fa4, 0(a4) // data[2 * m]
flw fa5, 4(a4) // data[2 * m + 1]
flw fa2, 0(a3) // data[2 * ia]
flw fa3, 4(a3) // data[2 * ia + 1]
fmul.s ft6, fa0, fa4 // re_temp = c * data[2 * m]
fmul.s ft7, fa0, fa5 // im_temp = c * data[2 * m + 1]
fmadd.s ft6, fa1, fa5, ft6 // re_temp += s * data[2 * m + 1];
fnmsub.s ft7, fa1, fa4, ft7 // im_temp -= s * data[2 * m];
fsub.s ft8, fa2, ft6 // = data[2 * ia] - re_temp;
fsub.s ft9, fa3, ft7 // = data[2 * ia + 1] - im_temp;
fadd.s ft10, fa2, ft6 // = data[2 * ia] + re_temp;
fadd.s ft11, fa3, ft7 // = data[2 * ia + 1] + im_temp;
fsw ft8, 0(a4) // data[2 * m]
fsw ft9, 4(a4) // data[2 * m + 1]
fsw ft10, 0(a3) // data[2 * ia]
fsw ft11, 4(a3) // data[2 * ia + 1]
.ifft2r_l3: add t4, t4, 1 // ia++
add t4, t4, t6 // ia += N2
add t1, t1, 1 // j++
BNE t1, t0, .ifft2r_l2
slli t0, t0, 1 // ie = ie<<1
srli t6, t6, 1 // t6 = N2 >> 1
BNEZ t6, .ifft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0 // always returns 0
ret

View File

@@ -0,0 +1,304 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Forward radix-4 complex float FFT butterflies for the esp32p4 processor (RISC-V),
// working in place on interleaved (re, im) data.
.text
.align 4
.global dl_fft4r_fc32_arp4_
.type dl_fft4r_fc32_arp4_,@function
dl_fft4r_fc32_arp4_:
//esp_err_t dl_fft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// data - a0, N / length - a1, table - a2
// table_size - a3 (later: wind_step in bytes)
// m - t0
// j - t1
// ptrc0..ptrc3 - a4..a7, winc0..winc2 - t2..t4
add sp,sp,-16
#
srli t6, a1, 1 // t6 = N/2, used as stage counter: shifted right by 2 per stage until it reaches 0
li t0, 2 // t0 - m
div a3, a3, a1 // wind_step = table_size / N
slli a3, a3, 3 // wind_step in bytes (one complex entry = 8 bytes)
.fft2r_l1:
li t1, 0 // t1 - j
srli a1, a1, 2 // a1 = length = length >> 2;
.fft2r_l2: // loop for j, t1 - j
slli t2, a1, 4 // t2 = length << 1 << 3 (8 bytes for one complex sample)
slli t3, a1, 3 // t3 = length << 3: byte distance between ptrc0, ptrc1, ptrc2 and ptrc3
// start_index = j * (length << 1); // n: n-point FFT
mul t2,t2,t1 // t2 = start_index in bytes
add a4, a0, t2 // fc32_t *ptrc0
add a5, a4, t3 // fc32_t *ptrc1
add a6, a5, t3 // fc32_t *ptrc2
add a7, a6, t3 // fc32_t *ptrc3
# flw fa0, 0(a4)
# fsw fa0, 0(t3)
# add t3, t3, 4
mv t2, a2 // winc0
mv t3, a2 // winc1
mv t4, a2 // winc2
esp.lp.setup 0, a1, .fft2r_l3 // hardware loop, `length` iterations; .fft2r_l3 - label to the last executed instruction
flw fa0, 0(a4) // in0.re
flw fa4, 0(a6) // in2.re
fadd.s ft0, fa0, fa4 // in0.re + in2.re
flw fa1, 4(a4) // in0.im
fsub.s ft1, fa0, fa4 // in0.re - in2.re
flw fa5, 4(a6) // in2.im
fadd.s ft2, fa1, fa5 // in0.im + in2.im
flw fa2, 0(a5) // in1.re
fsub.s ft3, fa1, fa5 // in0.im - in2.im
flw fa6, 0(a7) // in3.re
fadd.s ft4, fa2, fa6 // in1.re + in3.re
flw fa3, 4(a5) // in1.im
fsub.s ft5, fa2, fa6 // in1.re - in3.re
flw fa7, 4(a7) // in3.im
fadd.s ft6, fa3, fa7 // in1.im + in3.im
fsub.s ft7, fa3, fa7 // in1.im - in3.im
# bfly[0].re = ft0 + ft4;
fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 + ft7;
fadd.s fa2, ft1, ft7;
# bfly[1].im = ft3 - ft5;
fsub.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft4;
fsub.s fa4, ft0, ft4;
flw ft0, 0(t2) // winc0->re
# bfly[2].im = ft2 - ft6;
fsub.s fa5, ft2, ft6;
flw ft2, 0(t3) // winc1->re
# bfly[3].re = ft1 - ft7;
fsub.s fa6, ft1, ft7;
flw ft1, 4(t2) // winc0->im
# bfly[3].im = ft3 + ft5;
fadd.s fa7, ft3, ft5;
// *ptrc0 = bfly[0];
fsw fa0, 0(a4) // ptrc0->re
fsw fa1, 4(a4) // ptrc0->im
flw ft3, 4(t3) // winc1->im
// The multiplies and fused multiply-adds below are interleaved; together they compute
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im; (fa0)
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im; (fa1)
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im; (ft0)
fmul.s fa0, fa2, ft0
add t2, t2, a3 // winc0 += 1 * wind_step;
fmul.s fa1, fa3, ft0
fmul.s ft0, fa4, ft2
fmul.s ft2, fa5, ft2
flw ft4, 0(t4) // winc2->re
flw ft5, 4(t4) // winc2->im
fmadd.s fa0, fa3, ft1, fa0
add t3, t3, a3 // winc1 += 2 * wind_step;
fnmsub.s fa1, fa2, ft1, fa1
add t3, t3, a3 //
fmul.s fa2, fa6, ft4
fmul.s fa3, fa7, ft4
add t4, t4, a3 // winc2 += 3 * wind_step;
fmadd.s ft0, fa5, ft3, ft0
add t4, t4, a3 //
fnmsub.s ft2, fa4, ft3, ft2
fmadd.s ft3, fa7, ft5, fa2
add t4, t4, a3 //
fnmsub.s fa3, fa6, ft5, fa3
fsw fa0, 0(a5) // ptrc1->re
add a4, a4, 8
fsw fa1, 4(a5) // ptrc1->im
add a5, a5, 8
fsw ft0, 0(a6) // ptrc2->re
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
fsw ft2, 4(a6) // ptrc2->im
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
add a6, a6, 8
fsw ft3, 0(a7) // ptrc3->re
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
fsw fa3, 4(a7) // ptrc3->im
add a7, a7, 8
// Temp solution: the nop below only carries the loop-end label
.fft2r_l3: nop
add t1, t1, 2 // j+=2
BNE t1, t0, .fft2r_l2
slli t0, t0, 2 // t0 = m = m<<2
srli t6, t6, 2 // t6 >>= 2: one radix-4 stage done
slli a3, a3, 2 // wind_step = wind_step << 2;
BNEZ t6, .fft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0 // always returns 0
ret
// dl_ifft4r_fc32_arp4_: in-place inverse radix-4 butterfly passes over interleaved
// (re, im) float32 samples.
// Compared with the forward form documented in the C-style comments of this file,
// the +/- signs of the bfly[1]/bfly[3] cross terms are swapped and the twiddles are
// applied without conjugation:
//   out.re = b.re * w.re - b.im * w.im
//   out.im = b.im * w.re + b.re * w.im
// No 1/N scaling and no bit-reversal reordering is performed in this routine.
// Always returns 0 in a0.
.text
.align 4
.global dl_ifft4r_fc32_arp4_
.type dl_ifft4r_fc32_arp4_,@function
dl_ifft4r_fc32_arp4_:
//esp_err_t dl_ifft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// data - a0
// N / length - a1 (divided by 4 at the start of every stage)
// table - a2
// table_size - a3 (becomes wind_step in bytes)
// m - t0
// j - t1
// stage counter - t6
// ptrc0..ptrc3 - a4..a7
// winc0..winc2 - t2..t4
add sp,sp,-16 // reserve 16 bytes of stack; nothing is stored there in this routine
#
srli t6, a1, 1 // t6 = N/2: stage counter, shifted right by 2 after every stage until it reaches 0
li t0, 2 // t0 - m
div a3, a3, a1 // wind_step = table_size / N
slli a3, a3, 3 // wind_step in bytes (one complex sample = 8 bytes)
.ifft2r_l1: // start of one radix-4 stage
li t1, 0 // t1 - j
srli a1, a1, 2 // a1 = length = length >> 2;
.ifft2r_l2: // loop for j, t1 - j
slli t2, a1, 4 // t2 = (length << 1) * 8 bytes
slli t3, a1, 3 // t3 = length * 8 bytes (distance between ptrc0, ptrc1, ptrc2, ptrc3)
// start_index = j * (length << 1); // n: n-point FFT
mul t2,t2,t1 // t2 = start_index in bytes
add a4, a0, t2 // fc32_t *ptrc0
add a5, a4, t3 // fc32_t *ptrc1
add a6, a5, t3 // fc32_t *ptrc2
add a7, a6, t3 // fc32_t *ptrc3
# flw fa0, 0(a4)
# fsw fa0, 0(t3)
# add t3, t3, 4
mv t2, a2 // winc0
mv t3, a2 // winc1
mv t4, a2 // winc2
esp.lp.setup 0, a1, .ifft2r_l3 // hardware loop, count = length; .ifft2r_l3 - label to the last executed instruction
flw fa0, 0(a4) // in0.re
flw fa4, 0(a6) // in2.re
fadd.s ft0, fa0, fa4 // in0.re + in2.re
flw fa1, 4(a4) // in0.im
fsub.s ft1, fa0, fa4 // in0.re - in2.re
flw fa5, 4(a6) // in2.im
fadd.s ft2, fa1, fa5 // in0.im + in2.im
flw fa2, 0(a5) // in1.re
fsub.s ft3, fa1, fa5 // in0.im - in2.im
flw fa6, 0(a7) // in3.re
fadd.s ft4, fa2, fa6 // in1.re + in3.re
flw fa3, 4(a5) // in1.im
fsub.s ft5, fa2, fa6 // in1.re - in3.re
flw fa7, 4(a7) // in3.im
fadd.s ft6, fa3, fa7 // in1.im + in3.im
fsub.s ft7, fa3, fa7 // in1.im - in3.im
# bfly[0].re = ft0 + ft4;
fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 - ft7;
fsub.s fa2, ft1, ft7;
# bfly[1].im = ft3 + ft5;
fadd.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft4;
fsub.s fa4, ft0, ft4;
flw ft0, 0(t2) // winc0->re
# bfly[2].im = ft2 - ft6;
fsub.s fa5, ft2, ft6;
flw ft2, 0(t3) // winc1->re
# bfly[3].re = ft1 + ft7;
fadd.s fa6, ft1, ft7;
flw ft1, 4(t2) // winc0->im
# bfly[3].im = ft3 - ft5;
fsub.s fa7, ft3, ft5;
// *ptrc0 = bfly[0];
fsw fa0, 0(a4) // in0.re
fsw fa1, 4(a4) // in0.im
flw ft3, 4(t3) // winc1->im
// ptrc1->re = bfly[1].re * winc0->re - bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re + bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re - bfly[2].im * winc1->im;
fmul.s fa0, fa2, ft0 // bfly[1].re * winc0->re
add t2, t2, a3 // winc0 += 1 * wind_step;
fmul.s fa1, fa3, ft0 // bfly[1].im * winc0->re
fmul.s ft0, fa4, ft2 // bfly[2].re * winc1->re (ft0 no longer holds winc0->re)
fmul.s ft2, fa5, ft2 // bfly[2].im * winc1->re (ft2 no longer holds winc1->re)
flw ft4, 0(t4) // winc2->re
flw ft5, 4(t4) // winc2->im
fnmsub.s fa0, fa3, ft1, fa0 // fa0 = fa0 - bfly[1].im * winc0->im
add t3, t3, a3 // winc1 += 2 * wind_step; (first of two adds)
fmadd.s fa1, fa2, ft1, fa1 // fa1 = fa1 + bfly[1].re * winc0->im
add t3, t3, a3 // second add for winc1
fmul.s fa2, fa6, ft4 // bfly[3].re * winc2->re
fmul.s fa3, fa7, ft4 // bfly[3].im * winc2->re
add t4, t4, a3 // winc2 += 3 * wind_step; (first of three adds)
fnmsub.s ft0, fa5, ft3, ft0 // ft0 = ft0 - bfly[2].im * winc1->im
add t4, t4, a3 // second add for winc2
fmadd.s ft2, fa4, ft3, ft2 // ft2 = ft2 + bfly[2].re * winc1->im
fnmsub.s ft3, fa7, ft5, fa2 // ft3 = fa2 - bfly[3].im * winc2->im (ft3 reused as the result register)
add t4, t4, a3 // third add for winc2
fmadd.s fa3, fa6, ft5, fa3 // fa3 = fa3 + bfly[3].re * winc2->im
fsw fa0, 0(a5) // in1.re
add a4, a4, 8 // ptrc0++
fsw fa1, 4(a5) // in1.im
add a5, a5, 8 // ptrc1++
fsw ft0, 0(a6) // in2.re
// ptrc2->im = bfly[2].im * winc1->re + bfly[2].re * winc1->im;
fsw ft2, 4(a6) // in2.im
// ptrc3->re = bfly[3].re * winc2->re - bfly[3].im * winc2->im;
add a6, a6, 8 // ptrc2++
fsw ft3, 0(a7) // in3.re
// ptrc3->im = bfly[3].im * winc2->re + bfly[3].re * winc2->im;
fsw fa3, 4(a7) // in3.im
add a7, a7, 8 // ptrc3++
// Temp solution: the nop gives the hardware loop a dedicated last instruction
.ifft2r_l3: nop
add t1, t1, 2 // j+=2
BNE t1, t0, .ifft2r_l2
slli t0, t0, 2 // t0 = m = m<<2
srli t6, t6, 2 // t6 >>= 2 (one radix-4 stage consumed)
slli a3, a3, 2 // wind_step = wind_step << 2;
BNEZ t6, .ifft2r_l1// next stage while the counter is non-zero
#
add sp,sp,16 // release the stack reserved on entry
li a0,0 // return 0
ret

View File

@@ -0,0 +1,197 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
// dl_fft2r_fc32_aes3_: in-place radix-2 complex FFT butterfly passes over interleaved
// (re, im) float32 data, using 64-bit paired loads/stores (ee.ldf.64.ip / ee.stf.64.ip).
// NOTE(review): the local labels in this forward routine are named .ifft2r_*, while the
// inverse routine later in the file uses .fft2r_*; the names are swapped but only used locally.
// NOTE(review): a2 (data pointer on entry) is never written before retw, so no explicit
// status value is produced here - confirm that callers ignore the return value.
.text
.align 4
.global dl_fft2r_fc32_aes3_
.type dl_fft2r_fc32_aes3_,@function
// The function implements the following C code:
//esp_err_t dl_fft2r_fc32_ansi(float *data, int N)
//{
// float *w = dl_fft_w_table_fc32;
//
// int ie, ia, m;
// float re_temp, im_temp;
// float c, s;
// int N2 = N;
// ie = 1;
// for (int N2 = N/2; N2 > 0; N2 >>= 1) {
// ia = 0;
// for (int j = 0; j < ie; j++) {
// c = w[2 * j];
// s = w[2 * j + 1];
// for (int i = 0; i < N2; i++) {
// m = ia + N2;
// re_temp = c * data[2 * m] + s * data[2 * m + 1];
// im_temp = c * data[2 * m + 1] - s * data[2 * m];
// data[2 * m] = data[2 * ia] - re_temp;
// data[2 * m + 1] = data[2 * ia + 1] - im_temp;
// data[2 * ia] = data[2 * ia] + re_temp;
// data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
// ia++;
// }
// ia += N2;
// }
// ie <<= 1;
// }
// return result;
//}
dl_fft2r_fc32_aes3_:
//esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float* dl_fft_w_table_fc32)
entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - k, main loop counter; N2 - for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - pointer to w[2 * j] (a4 + (j << 3))
// f0 - c or w[2 * j]
// f1 - s or w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer
// a14 - m pointer
// f6 - re_temp
// f7 - im_temp
srli a6, a3, 1 // a6 = N2 = N/2
movi.n a7, 1 // a7 - ie
.ifft2r_l1: // start of one stage (one value of N2)
movi.n a8, 0 // a8 - j
movi.n a11,0 // a11 = ia = 0;
.ifft2r_l2: // loop for j, a8 - j
addx8 a10, a8, a4 // a10 = &w[2 * j] (table base + j * 8 bytes)
ee.ldf.64.ip f1, f0, a10, 0 // f0 = c = w[2 * j], f1 = s = w[2 * j + 1]
add.n a12, a11, a6 // a12 = m = ia + N2
addx8 a14, a12, a2 // a14 - pointer for m*2
loopnez a6, .ifft2r_l3 // zero-overhead loop: N2 iterations, body ends at .ifft2r_l3
ee.ldf.64.ip f5, f4, a14, 0 // data[2 * m], data[2 * m + 1]
mul.s f6, f0, f4 // re_temp = c * data[2 * m]
mul.s f7, f0, f5 // im_temp = c * data[2 * m + 1]
addx8 a13, a11, a2 // a13 - pointer for ia*2
ee.ldf.64.ip f3, f2, a13, 0 // data[2 * ia], data[2 * ia + 1]
madd.s f6, f1, f5 // re_temp += s * data[2 * m + 1];
msub.s f7, f1, f4 // im_temp -= s * data[2 * m];
addi a11, a11, 1 // ia++
add.n a12, a11, a6 // a12 = m = ia + N2 (for the next iteration)
sub.s f8, f2, f6 // = data[2 * ia] - re_temp;
sub.s f9, f3, f7 // = data[2 * ia + 1] - im_temp;
add.s f10, f2, f6 // = data[2 * ia] + re_temp;
add.s f11, f3, f7 // = data[2 * ia + 1] + im_temp;
ee.stf.64.ip f9, f8, a14, 0 // store data[2 * m], data[2 * m + 1] (a14 still points at the old m)
addx8 a14, a12, a2 // a14 - pointer for m*2 (next iteration)
ee.stf.64.ip f11, f10, a13, 0 // store data[2 * ia], data[2 * ia + 1]
.ifft2r_l3: // end of the inner loop body
add.n a11, a11, a6 // ia += N2
addi.n a8, a8, 1 // j++
bne a8, a7, .ifft2r_l2 // repeat while j != ie
slli a7, a7, 1 // ie = ie<<1
// main loop: for (int k = N/2; k > 0; k >>= 1)
srli a6, a6, 1 // a6 = a6>>1
bnez a6, .ifft2r_l1 // Jump if > 0
retw
// dl_ifft2r_fc32_aes3_: in-place inverse radix-2 complex butterfly passes over interleaved
// (re, im) float32 data. The loop structure is the same as the forward radix-2 routine in
// this file; the only arithmetic difference is that s = w[2 * j + 1] is negated after it
// is loaded (neg.s f1, f1). No 1/N scaling is applied in this routine.
// NOTE(review): a2 (data pointer on entry) is never written before retw, so no explicit
// status value is produced here - confirm that callers ignore the return value.
.text
.align 4
.global dl_ifft2r_fc32_aes3_
.type dl_ifft2r_fc32_aes3_,@function
dl_ifft2r_fc32_aes3_:
//esp_err_t dl_ifft2r_fc32_aes3_(float *data, int N, float* dl_fft_w_table_fc32)
entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - k, main loop counter; N2 - for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - pointer to w[2 * j] (a4 + (j << 3))
// f0 - c or w[2 * j]
// f1 - s or -w[2 * j + 1] (negated for the inverse transform)
// a11 - ia
// a12 - m
// a13 - ia pointer
// a14 - m pointer
// f6 - re_temp
// f7 - im_temp
srli a6, a3, 1 // a6 = N2 = N/2
movi.n a7, 1 // a7 - ie
.fft2r_l1: // start of one stage (one value of N2)
movi.n a8, 0 // a8 - j
movi.n a11,0 // a11 = ia = 0;
.fft2r_l2: // loop for j, a8 - j
addx8 a10, a8, a4 // a10 = &w[2 * j] (table base + j * 8 bytes)
ee.ldf.64.ip f1, f0, a10, 0 // f0 = c = w[2 * j], f1 = w[2 * j + 1]
// CHANGE: Negate the imaginary part of twiddle factors
neg.s f1, f1 // s = -w[2 * j + 1]
add.n a12, a11, a6 // a12 = m = ia + N2
addx8 a14, a12, a2 // a14 - pointer for m*2
loopnez a6, .fft2r_l3 // zero-overhead loop: N2 iterations, body ends at .fft2r_l3
ee.ldf.64.ip f5, f4, a14, 0 // data[2 * m], data[2 * m + 1]
mul.s f6, f0, f4 // re_temp = c * data[2 * m]
mul.s f7, f0, f5 // im_temp = c * data[2 * m + 1]
addx8 a13, a11, a2 // a13 - pointer for ia*2
ee.ldf.64.ip f3, f2, a13, 0 // data[2 * ia], data[2 * ia + 1]
madd.s f6, f1, f5 // re_temp += s * data[2 * m + 1]; (s already negated)
msub.s f7, f1, f4 // im_temp -= s * data[2 * m]; (s already negated)
addi a11, a11, 1 // ia++
add.n a12, a11, a6 // a12 = m = ia + N2 (for the next iteration)
sub.s f8, f2, f6 // = data[2 * ia] - re_temp;
sub.s f9, f3, f7 // = data[2 * ia + 1] - im_temp;
add.s f10, f2, f6 // = data[2 * ia] + re_temp;
add.s f11, f3, f7 // = data[2 * ia + 1] + im_temp;
ee.stf.64.ip f9, f8, a14, 0 // store data[2 * m], data[2 * m + 1] (a14 still points at the old m)
addx8 a14, a12, a2 // a14 - pointer for m*2 (next iteration)
ee.stf.64.ip f11, f10, a13, 0 // store data[2 * ia], data[2 * ia + 1]
.fft2r_l3: // end of the inner loop body
add.n a11, a11, a6 // ia += N2
addi.n a8, a8, 1 // j++
bne a8, a7, .fft2r_l2 // repeat while j != ie
slli a7, a7, 1 // ie = ie<<1
// main loop: for (int k = N/2; k > 0; k >>= 1)
srli a6, a6, 1 // a6 = a6>>1
bnez a6, .fft2r_l1 // Jump if > 0
retw

View File

@@ -0,0 +1,288 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
// dl_fft4r_fc32_aes3_: in-place forward radix-4 complex butterfly passes over interleaved
// (re, im) float32 data, using 64-bit paired loads/stores (ee.ldf.64.ip / ee.stf.64.ip).
.section .text
.global dl_fft4r_fc32_aes3_
.type dl_fft4r_fc32_aes3_,@function
// The function implements the following C code:
// esp_err_t dl_fft4r_fc32_ansi_(float *data, int length, float *table, int table_size)
// {
// if (0 == dl_fft4r_initialized) {
// return ESP_ERR_DSP_UNINITIALIZED;
// }
//
// uint log2N = dl_power_of_two(length);
// if ((log2N & 0x01) != 0) {
// return ESP_ERR_DSP_INVALID_LENGTH;
// }
// uint log4N = log2N >> 1;
//
// fc32_t bfly[4];
// uint m = 2;
// uint wind_step = table_size / length;
// while (1) { ///radix 4
// if (log4N == 0) {
// break;
// }
// length = length >> 2;
// for (int j = 0; j < m; j += 2) { // j: which FFT of this step
// int start_index = j * (length << 1); // n: n-point FFT
//
// fc32_t *ptrc0 = (fc32_t *)data + start_index;
// fc32_t *ptrc1 = ptrc0 + length;
// fc32_t *ptrc2 = ptrc1 + length;
// fc32_t *ptrc3 = ptrc2 + length;
//
// fc32_t *winc0 = (fc32_t *)table;
// fc32_t *winc1 = winc0;
// fc32_t *winc2 = winc0;
//
// for (int k = 0; k < length; k++) {
// fc32_t in0 = *ptrc0;
// fc32_t in2 = *ptrc2;
// fc32_t in1 = *ptrc1;
// fc32_t in3 = *ptrc3;
//
// bfly[0].re = in0.re + in2.re + in1.re + in3.re;
// bfly[0].im = in0.im + in2.im + in1.im + in3.im;
//
// bfly[1].re = in0.re - in2.re + in1.im - in3.im;
// bfly[1].im = in0.im - in2.im - in1.re + in3.re;
//
// bfly[2].re = in0.re + in2.re - in1.re - in3.re;
// bfly[2].im = in0.im + in2.im - in1.im - in3.im;
//
// bfly[3].re = in0.re - in2.re - in1.im + in3.im;
// bfly[3].im = in0.im - in2.im + in1.re - in3.re;
//
// *ptrc0 = bfly[0];
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
//
// winc0 += 1 * wind_step;
// winc1 += 2 * wind_step;
// winc2 += 3 * wind_step;
//
// ptrc0++;
// ptrc1++;
// ptrc2++;
// ptrc3++;
// }
// }
// m = m << 2;
// wind_step = wind_step << 2;
// log4N--;
// }
// return ESP_OK;
// }
// Differences from the C reference that are visible in the assembly below:
// - there is no dl_fft4r_initialized check and no odd-log2N check;
// - log4N is tested only after a stage has run, so a length below 4 is not handled here
//   (NOTE(review): presumably guaranteed by the callers - confirm);
// - a2 (data pointer on entry) is never written before retw, so no explicit ESP_OK value
//   is produced (NOTE(review): confirm that callers ignore the return value).
// Register use:
// a2 - data, a3 - N (divided by 4 every stage), a4 - table, a5 - table_size on entry, then m
// a6 - w_step counted in floats, a7 - log4N, a8 - j
// a9/a10/a11 - w0/w1/w2, a12..a15 - p0..p3
// esp_err_t dl_fft4r_fc32_aes3_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
.align 4
dl_fft4r_fc32_aes3_:
entry a1, 16 # no auto vars on stack
nsau a6, a3 # inline dl_power_of_two(N)
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - nsau(N) = log2(N)
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1), i.e. in floats (2 per complex twiddle)
movi.n a5, 2 # m = 2
.stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # j * N; mul16u multiplies only the low 16 bits of each operand
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .bf4_loop_end # for (uint k = 0; k < N; k++)
ee.ldf.64.ip f1, f0, a12, 0 # f0 = in0.re = *p0, f1 = in0.im = *(p0 + 1)
ee.ldf.64.ip f3, f2, a14, 0 # f2 = in2.re = *p2, f3 = in2.im = *(p2 + 1)
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
ee.ldf.64.ip f1, f0, a13, 0 # f0 = in1.re = *p1, f1 = in1.im = *(p1 + 1)
ee.ldf.64.ip f3, f2, a15, 0 # f2 = in3.re = *p3, f3 = in3.im = *(p3 + 1)
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
add.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re + in1.im - in3.im
sub.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re - in1.im + in3.im
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
sub.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im - in1.re + in3.re
add.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im + in1.re - in3.re
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
ee.stf.64.ip f5, f4, a12, 8 # *p0 = f4 = bf0.re, *(p0 + 1) = f5 = bf0.im, p0 += 2
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
msub.s f5, f2, f12 # f5 = bf1.im * w0->re - bf1.re * w0->im
madd.s f4, f3, f12 # f4 = bf1.re * w0->re + bf1.im * w0->im
msub.s f9, f0, f13 # f9 = bf2.im * w1->re - bf2.re * w1->im
madd.s f8, f1, f13 # f8 = bf2.re * w1->re + bf2.im * w1->im
msub.s f11, f6, f14 # f11 = bf3.im * w2->re - bf3.re * w2->im
madd.s f10, f7, f14 # f10 = bf3.re * w2->re + bf3.im * w2->im
addx4 a9, a6, a9 # w0 += w_step
addx8 a10, a6, a10 # w1 += 2 * w_step
addx4 a11, a6, a11 # w2 += w_step ...
addx8 a11, a6, a11 # ... + 2 * w_step, i.e. w2 += 3 * w_step
ee.stf.64.ip f5, f4, a13, 8 # *p1 = f4, *(p1 + 1) = f5, p1 += 2
ee.stf.64.ip f9, f8, a14, 8 # *p2 = f8, *(p2 + 1) = f9, p2 += 2
ee.stf.64.ip f11, f10, a15, 8 # *p3 = f10, *(p3 + 1) = f11, p3 += 2
.bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .stage_next # if j >= m
j .group
.stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .stage # if log4N > 0
retw
// dl_ifft4r_fc32_aes3_: in-place inverse radix-4 complex butterfly passes over interleaved
// (re, im) float32 data. Same structure as the forward radix-4 routine in this file; the
// lines marked "ifft change" swap the sign of the bf1/bf3 cross terms and apply the twiddles
// without conjugation (re = b.re*w.re - b.im*w.im, im = b.im*w.re + b.re*w.im).
// No 1/N scaling is applied in this routine.
// NOTE(review): log4N is tested only after a stage has run, so a length below 4 is not
// handled here - presumably guaranteed by the callers; confirm.
// NOTE(review): a2 (data pointer on entry) is never written before retw, so no explicit
// status value is produced - confirm that callers ignore the return value.
.section .text
.global dl_ifft4r_fc32_aes3_
.type dl_ifft4r_fc32_aes3_,@function
// esp_err_t dl_ifft4r_fc32_aes3_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
.align 4
dl_ifft4r_fc32_aes3_:
entry a1, 16 # no auto vars on stack
nsau a6, a3 # inline dl_power_of_two(N)
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - nsau(N) = log2(N)
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1), i.e. in floats (2 per complex twiddle)
movi.n a5, 2 # m = 2
.ifft_stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.ifft_group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # j * N; mul16u multiplies only the low 16 bits of each operand
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .inv_bf4_loop_end # for (uint k = 0; k < N; k++)
ee.ldf.64.ip f1, f0, a12, 0 # f0 = in0.re = *p0, f1 = in0.im = *(p0 + 1)
ee.ldf.64.ip f3, f2, a14, 0 # f2 = in2.re = *p2, f3 = in2.im = *(p2 + 1)
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
ee.ldf.64.ip f1, f0, a13, 0 # f0 = in1.re = *p1, f1 = in1.im = *(p1 + 1)
ee.ldf.64.ip f3, f2, a15, 0 # f2 = in3.re = *p3, f3 = in3.im = *(p3 + 1)
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
sub.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re - in1.im + in3.im //ifft change
add.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re + in1.im - in3.im //ifft change
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
add.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im + in1.re - in3.re //ifft change
sub.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im - in1.re + in3.re //ifft change
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
ee.stf.64.ip f5, f4, a12, 8 # *p0 = f4 = bf0.re, *(p0 + 1) = f5 = bf0.im, p0 += 2
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
madd.s f5, f2, f12 # f5 = bf1.im * w0->re + bf1.re * w0->im //ifft change
msub.s f4, f3, f12 # f4 = bf1.re * w0->re - bf1.im * w0->im //ifft change
madd.s f9, f0, f13 # f9 = bf2.im * w1->re + bf2.re * w1->im //ifft change
msub.s f8, f1, f13 # f8 = bf2.re * w1->re - bf2.im * w1->im //ifft change
madd.s f11, f6, f14 # f11 = bf3.im * w2->re + bf3.re * w2->im //ifft change
msub.s f10, f7, f14 # f10 = bf3.re * w2->re - bf3.im * w2->im //ifft change
addx4 a9, a6, a9 # w0 += w_step
addx8 a10, a6, a10 # w1 += 2 * w_step
addx4 a11, a6, a11 # w2 += w_step ...
addx8 a11, a6, a11 # ... + 2 * w_step, i.e. w2 += 3 * w_step
ee.stf.64.ip f5, f4, a13, 8 # *p1 = f4, *(p1 + 1) = f5, p1 += 2
ee.stf.64.ip f9, f8, a14, 8 # *p2 = f8, *(p2 + 1) = f9, p2 += 2
ee.stf.64.ip f11, f10, a15, 8 # *p3 = f10, *(p3 + 1) = f11, p3 += 2
.inv_bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .ifft_stage_next # if j >= m
j .ifft_group
.ifft_stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .ifft_stage # if log4N > 0
retw

View File

@@ -0,0 +1,65 @@
## ESP32-C5 fft benchmark:
| Test Name | Size | SNR (dB) | RMSE | Time (μs) | Test Time (ms) |
|-----------------------|----------|--------------|------------|---------------|--------------------|
| 1. test dl fft | 128 | 105 | 0.000316 | 1220 | 703 |
| | 256 | 108 | 0.000316 | 2801 | |
| | 512 | 111 | 0.000316 | 6323 | |
| | 1024 | 114 | 0.000316 | 14083 | |
| | 2048 | 117 | 0.000316 | 31060 | |
| 2. test dl ifft | 128 | 85.7 | 0.000316 | 1278 | 772 |
| | 256 | 85.4 | 0.000316 | 2919 | |
| | 512 | 85.4 | 0.000316 | 6548 | |
| | 1024 | 85.4 | 0.000316 | 14537 | |
| | 2048 | 85.2 | 0.000316 | 31963 | |
| 3. test dl rfft | 128 | 101 | 0.000316 | 621 | 449 |
| | 256 | 105 | 0.000316 | 1534 | |
| | 512 | 108 | 0.000316 | 3119 | |
| | 1024 | 110 | 0.000316 | 7577 | |
| | 2048 | 114 | 0.000316 | 15088 | |
| 4. test dl irfft | 128 | 85.5 | 0.000316 | 668 | 491 |
| | 256 | 85.9 | 0.000316 | 1630 | |
| | 512 | 84.5 | 0.000316 | 3305 | |
| | 1024 | 85.3 | 0.000316 | 7948 | |
| | 2048 | 85.4 | 0.000316 | 15804 | |
| 5. test dl fft s16 | 128 | 65.9 | 0.001719 | 87 | 139 |
| | 256 | 62.0 | 0.003524 | 194 | |
| | 512 | 59.2 | 0.006614 | 429 | |
| | 1024 | 56.8 | 0.013190 | 938 | |
| | 2048 | 53.7 | 0.026223 | 2038 | |
| 6. test dl ifft s16 | 128 | 59.8 | 0.000527 | 89 | 143 |
| | 256 | 52.2 | 0.000902 | 198 | |
| | 512 | 51.4 | 0.000917 | 435 | |
| | 1024 | 51.1 | 0.000960 | 951 | |
| | 2048 | 45.7 | 0.001737 | 2063 | |
| 7. test dl fft hp s16 | 128 | 76.1 | 0.000621 | 111 | 149 |
| | 256 | 73.6 | 0.000975 | 248 | |
| | 512 | 72.6 | 0.001546 | 537 | |
| | 1024 | 73.0 | 0.002084 | 1188 | |
| | 2048 | 69.9 | 0.004387 | 2545 | |
| 8. test dl ifft hp s16 | 128 | 72.6 | 0.000327 | 112 | 158 |
| | 256 | 71.5 | 0.000328 | 251 | |
| | 512 | 68.9 | 0.000334 | 542 | |
| | 1024 | 68.9 | 0.000335 | 1199 | |
| | 2048 | 67.8 | 0.000345 | 2566 | |
| 9. test dl rfft s16 | 128 | 63.8 | 0.001403 | 45 | 130 |
| | 256 | 60.7 | 0.002885 | 99 | |
| | 512 | 58.2 | 0.005433 | 218 | |
| | 1024 | 54.5 | 0.011284 | 477 | |
| | 2048 | 51.9 | 0.022304 | 1034 | |
| 10. test dl irfft s16 | 128 | 57.8 | 0.000596 | 45 | 134 |
| | 256 | 52.2 | 0.000937 | 100 | |
| | 512 | 50.2 | 0.000984 | 220 | |
| | 1024 | 45.6 | 0.001740 | 480 | |
| | 2048 | 40.2 | 0.003298 | 1041 | |
| 11. test dl rfft hp s16 | 128 | 75.7 | 0.000464 | 57 | 135 |
| | 256 | 74.2 | 0.000730 | 123 | |
| | 512 | 72.7 | 0.001103 | 272 | |
| | 1024 | 73.0 | 0.001432 | 585 | |
| | 2048 | 70.3 | 0.002952 | 1284 | |
| 12. test dl irfft hp s16 | 128 | 72.8 | 0.000324 | 58 | 139 |
| | 256 | 71.1 | 0.000330 | 123 | |
| | 512 | 67.8 | 0.000338 | 273 | |
| | 1024 | 69.5 | 0.000335 | 587 | |
| | 2048 | 67.4 | 0.000346 | 1289 | |

View File

@@ -0,0 +1,65 @@
## ESP32-P4 fft benchmark:
| Test Name | Size | SNR (dB) | RMSE | Time (μs) |
|-----------------------|----------|--------------|------------|---------------|
| 1. test dl fft | 128 | 104.868538 | 0.000316 | 38 |
| | 256 | 107.637619 | 0.000316 | 85 |
| | 512 | 110.548630 | 0.000316 | 188 |
| | 1024 | 113.582832 | 0.000316 | 415 |
| | 2048 | 116.905914 | 0.000316 | 904 |
| 2. test dl ifft | 128 | 85.701355 | 0.000316 | 46 |
| | 256 | 85.375244 | 0.000316 | 99 |
| | 512 | 85.372276 | 0.000316 | 217 |
| | 1024 | 85.351921 | 0.000316 | 471 |
| | 2048 | 85.206238 | 0.000316 | 1017 |
| 3. test dl rfft | 128 | 101.360046 | 0.000316 | 18 |
| | 256 | 105.289742 | 0.000316 | 44 |
| | 512 | 107.978775 | 0.000316 | 88 |
| | 1024 | 110.488144 | 0.000316 | 212 |
| | 2048 | 113.904335 | 0.000316 | 416 |
| 4. test dl irfft | 128 | 85.467148 | 0.000316 | 22 |
| | 256 | 85.928154 | 0.000316 | 52 |
| | 512 | 84.540436 | 0.000316 | 102 |
| | 1024 | 85.282562 | 0.000316 | 244 |
| | 2048 | 85.383667 | 0.000316 | 465 |
| 5. test dl fft s16 | 128 | 65.917183 | 0.001719 | 60 |
| | 256 | 61.950771 | 0.003524 | 135 |
| | 512 | 59.240242 | 0.006614 | 299 |
| | 1024 | 56.814144 | 0.013190 | 654 |
| | 2048 | 53.681591 | 0.026223 | 1422 |
| 6. test dl ifft s16 | 128 | 59.837440 | 0.000527 | 61 |
| | 256 | 52.158340 | 0.000902 | 137 |
| | 512 | 51.414349 | 0.000917 | 303 |
| | 1024 | 51.119301 | 0.000960 | 663 |
| | 2048 | 45.654255 | 0.001737 | 1439 |
| 7. test dl fft hp s16 | 128 | 76.132126 | 0.000621 | 79 |
| | 256 | 73.598412 | 0.000975 | 177 |
| | 512 | 72.596603 | 0.001546 | 384 |
| | 1024 | 73.045952 | 0.002084 | 853 |
| | 2048 | 69.902023 | 0.004387 | 1826 |
| 8. test dl ifft hp s16 | 128 | 72.633217 | 0.000327 | 80 |
| | 256 | 71.462891 | 0.000328 | 180 |
| | 512 | 68.908401 | 0.000334 | 389 |
| | 1024 | 68.920097 | 0.000335 | 862 |
| | 2048 | 67.777245 | 0.000345 | 1842 |
| 9. test dl rfft s16 | 128 | 63.782593 | 0.001403 | 32 |
| | 256 | 60.652668 | 0.002885 | 70 |
| | 512 | 58.204708 | 0.005433 | 154 |
| | 1024 | 54.490803 | 0.011284 | 337 |
| | 2048 | 51.854618 | 0.022304 | 730 |
| 10. test dl irfft s16 | 128 | 57.822262 | 0.000596 | 32 |
| | 256 | 52.207390 | 0.000937 | 71 |
| | 512 | 50.153603 | 0.000984 | 155 |
| | 1024 | 45.564911 | 0.001740 | 338 |
| | 2048 | 40.217754 | 0.003298 | 733 |
| 11. test dl rfft hp s16 | 128 | 75.728333 | 0.000464 | 41 |
| | 256 | 74.201035 | 0.000730 | 88 |
| | 512 | 72.743904 | 0.001103 | 196 |
| | 1024 | 72.959915 | 0.001432 | 422 |
| | 2048 | 70.298073 | 0.002952 | 928 |
| 12. test dl irfft hp s16 | 128 | 72.830231 | 0.000324 | 41 |
| | 256 | 71.144485 | 0.000330 | 89 |
| | 512 | 67.758896 | 0.000338 | 198 |
| | 1024 | 69.508110 | 0.000335 | 424 |
| | 2048 | 67.428802 | 0.000346 | 933 |

View File

@@ -0,0 +1,65 @@
## ESP32-S3 fft benchmark:
| Test Name | Size | SNR (dB) | RMSE | Time (μs) |
|-----------------------|----------|--------------|------------|---------------|
| 1. test dl fft | 128 | 105 | 0.000316 | 61 |
| | 256 | 108 | 0.000316 | 136 |
| | 512 | 111 | 0.000316 | 299 |
| | 1024 | 114 | 0.000316 | 653 |
| | 2048 | 117 | 0.000316 | 1413 |
| 2. test dl ifft | 128 | 85.7 | 0.000316 | 80 |
| | 256 | 85.4 | 0.000316 | 175 |
| | 512 | 85.4 | 0.000316 | 375 |
| | 1024 | 85.4 | 0.000316 | 807 |
| | 2048 | 85.2 | 0.000316 | 1721 |
| 3. test dl rfft | 128 | 101 | 0.000316 | 34 |
| | 256 | 105 | 0.000316 | 73 |
| | 512 | 108 | 0.000316 | 156 |
| | 1024 | 110 | 0.000316 | 347 |
| | 2048 | 114 | 0.000316 | 714 |
| 4. test dl irfft | 128 | 85.5 | 0.000316 | 47 |
| | 256 | 85.9 | 0.000316 | 97 |
| | 512 | 84.5 | 0.000316 | 197 |
| | 1024 | 85.3 | 0.000316 | 432 |
| | 2048 | 85.4 | 0.000316 | 868 |
| 5. test dl fft s16 | 128 | 65.9 | 0.001719 | 131 |
| | 256 | 62.0 | 0.003524 | 289 |
| | 512 | 59.2 | 0.006614 | 633 |
| | 1024 | 56.8 | 0.013190 | 1374 |
| | 2048 | 53.7 | 0.026223 | 2966 |
| 6. test dl ifft s16 | 128 | 59.8 | 0.000527 | 133 |
| | 256 | 52.2 | 0.000902 | 293 |
| | 512 | 51.4 | 0.000917 | 640 |
| | 1024 | 51.1 | 0.000960 | 1387 |
| | 2048 | 45.7 | 0.001737 | 2992 |
| 7. test dl fft hp s16 | 128 | 76.1 | 0.000621 | 189 |
| | 256 | 73.6 | 0.000975 | 424 |
| | 512 | 72.6 | 0.001546 | 914 |
| | 1024 | 73.0 | 0.002084 | 2023 |
| | 2048 | 69.9 | 0.004387 | 4322 |
| 8. test dl ifft hp s16 | 128 | 72.6 | 0.000327 | 190 |
| | 256 | 71.5 | 0.000328 | 427 |
| | 512 | 68.9 | 0.000334 | 920 |
| | 1024 | 68.9 | 0.000335 | 2036 |
| | 2048 | 67.8 | 0.000345 | 4349 |
| 9. test dl rfft s16 | 128 | 63.8 | 0.001403 | 70 |
| | 256 | 60.7 | 0.002885 | 152 |
| | 512 | 58.2 | 0.005433 | 331 |
| | 1024 | 54.5 | 0.011284 | 717 |
| | 2048 | 51.9 | 0.022304 | 1542 |
| 10. test dl irfft s16 | 128 | 57.8 | 0.000596 | 70 |
| | 256 | 52.2 | 0.000937 | 153 |
| | 512 | 50.2 | 0.000984 | 334 |
| | 1024 | 45.6 | 0.001740 | 720 |
| | 2048 | 40.2 | 0.003298 | 1547 |
| 11. test dl rfft hp s16 | 128 | 75.7 | 0.000464 | 98 |
| | 256 | 74.2 | 0.000730 | 210 |
| | 512 | 72.7 | 0.001103 | 466 |
| | 1024 | 73.0 | 0.001432 | 998 |
| | 2048 | 70.3 | 0.002952 | 2190 |
| 12. test dl irfft hp s16 | 128 | 72.8 | 0.000324 | 98 |
| | 256 | 71.1 | 0.000330 | 210 |
| | 512 | 67.8 | 0.000338 | 468 |
| | 1024 | 69.5 | 0.000335 | 1001 |
| | 2048 | 67.4 | 0.000346 | 2196 |

View File

@@ -0,0 +1,126 @@
#pragma once
#include "dl_fft_base.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Single-precision floating-point FFT instance structure
* @param fft_point Number of FFT points
* @param log2n Log base 2 of FFT points
* @param fft_table FFT real to complex coefficient table
* @param rfft_table FFT complex to real coefficient table
* @param bitrev_table Table of 16-bit entries; presumably the bit-reversal index table
*                     applied to the transform output (TODO confirm against the .c files)
* @param bitrev_size Size that goes with bitrev_table; whether it counts entries or pairs
*                    is not visible in this header (TODO confirm)
*/
typedef struct {
int fft_point;
int log2n;
float *fft_table;
float *rfft_table;
uint16_t *bitrev_table;
int bitrev_size;
} dl_fft_f32_t;
/**
* @brief 16-bit fixed-point FFT instance structure
*
* Unlike dl_fft_f32_t, this structure carries no bit-reversal table.
*
* @param fft_point Number of FFT points
* @param log2n Log base 2 of FFT points
* @param fft_table FFT real to complex coefficient table (int16_t entries)
* @param rfft_table FFT complex to real coefficient table (int16_t entries)
*/
typedef struct {
int fft_point;
int log2n;
int16_t *fft_table;
int16_t *rfft_table;
} dl_fft_s16_t;
/**
* @brief Initialize a single-precision floating-point FFT instance
* @param fft_point Number of FFT points (must be power of two)
* @param caps Configuration flags for memory allocation, same with esp-idf heap_caps_malloc
* (e.g., MALLOC_CAP_8BIT, MALLOC_CAP_INTERNAL, MALLOC_CAP_SPIRAM)
* @return dl_fft_f32_t* Handle to FFT instance; release it with dl_fft_f32_deinit().
* NOTE(review): presumably NULL on failure (the C++ wrapper checks for a null handle) - confirm.
*/
dl_fft_f32_t *dl_fft_f32_init(int fft_point, uint32_t caps);
/**
* @brief Deinitialize a single-precision floating-point FFT instance
* @param handle FFT instance handle created by dl_fft_f32_init(); must not be used
* after this call
*/
void dl_fft_f32_deinit(dl_fft_f32_t *handle);
/**
* @brief Execute single-precision floating-point FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation.
* NOTE(review): presumably interleaved (re, im) floats, 2 * fft_point values - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_fft_f32_run(dl_fft_f32_t *handle, float *data);
/**
* @brief Execute single-precision floating-point inverse FFT transform
* @param handle FFT instance handle (the same handle type that dl_fft_f32_run() takes)
* @param data Input/output buffer, in-place ifft calculation.
* NOTE(review): presumably interleaved (re, im) floats, 2 * fft_point values - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_ifft_f32_run(dl_fft_f32_t *handle, float *data);
/**
* @brief Initialize a 16-bit fixed-point FFT instance
* @param fft_point Number of FFT points (must be power of two)
* @param caps Configuration flags for memory allocation, same with esp-idf heap_caps_malloc
* (e.g., MALLOC_CAP_8BIT, MALLOC_CAP_INTERNAL, MALLOC_CAP_SPIRAM)
* @return dl_fft_s16_t* Handle to FFT instance; release it with dl_fft_s16_deinit().
* NOTE(review): presumably NULL on failure (the C++ wrapper checks for a null handle) - confirm.
*/
dl_fft_s16_t *dl_fft_s16_init(int fft_point, uint32_t caps);
/**
* @brief Deinitialize a 16-bit fixed-point FFT instance
* @param handle FFT instance handle created by dl_fft_s16_init(); must not be used
* after this call
*/
void dl_fft_s16_deinit(dl_fft_s16_t *handle);
/**
* @brief Execute 16-bit fixed-point FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Pointer that receives the output data exponent (2^out_exponent scaling factor).
* NOTE(review): presumably must not be NULL (the C++ wrapper always passes a valid pointer) - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_fft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point inverse FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place ifft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Pointer that receives the output data exponent (2^out_exponent scaling factor).
* NOTE(review): presumably must not be NULL - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_ifft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point FFT with high-precision scaling
* @param handle FFT instance handle (same handle type as dl_fft_s16_run())
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Pointer that receives the output data exponent (2^out_exponent scaling factor).
* NOTE(review): presumably must not be NULL - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_fft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point inverse FFT with high-precision scaling
* @param handle FFT instance handle (same handle type as dl_ifft_s16_run())
* @param data Input/output buffer, in-place ifft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Pointer that receives the output data exponent (2^out_exponent scaling factor).
* NOTE(review): presumably must not be NULL - confirm.
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_ifft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,292 @@
#pragma once
#include <mutex>
#include <vector>
#include "dl_fft.h"
#include "dl_rfft.h"
namespace dl {
class FFT {
private:
// Private constructor for singleton
FFT() = default;
~FFT() = default;
// Delete copy constructor and assignment operator
FFT(const FFT &) = delete;
FFT &operator=(const FFT &) = delete;
// Four handle vectors for different FFT types
std::vector<dl_fft_f32_t *> fft_f32_handles;
std::vector<dl_fft_s16_t *> fft_s16_handles;
std::vector<dl_fft_f32_t *> rfft_f32_handles;
std::vector<dl_fft_s16_t *> rfft_s16_handles;
// Mutex guarding every access to the handle vectors (lookup as well as creation)
std::mutex mutex_;
uint32_t m_caps = MALLOC_CAP_8BIT; // Default memory allocation capabilities
// Return the cached handle whose fft_point equals fft_length, creating and caching one
// with init_func(fft_length, m_caps) when none exists yet.
//
// The whole lookup runs under mutex_. Scanning `handles` without the lock would race
// with a concurrent push_back from another thread: push_back may reallocate the
// vector's storage while the scan is still iterating over it.
//
// @param fft_length FFT size to look up
// @param handles    Cache to search; a newly created handle is appended to it
// @param init_func  Callable invoked as init_func(fft_length, m_caps) to build a handle
// @return the cached or newly created handle; a null result from init_func is returned
//         as-is and is not cached
template <typename HandleType, typename InitFunc>
HandleType *get_or_create_handle(int fft_length, std::vector<HandleType *> &handles, InitFunc init_func)
{
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto *handle : handles) {
        if (handle->fft_point == fft_length) {
            return handle;
        }
    }
    // Not cached yet: create it with the currently configured allocation capabilities
    HandleType *new_handle = init_func(fft_length, m_caps);
    if (new_handle) {
        handles.push_back(new_handle);
    }
    return new_handle;
}
public:
// Access the single shared FFT object; it is a function-local static, so it is
// constructed the first time this function runs.
static FFT *get_instance()
{
    static FFT singleton;
    FFT *const self = &singleton;
    return self;
}
uint32_t get_caps() { return m_caps; }
void set_caps(uint32_t caps) { m_caps = caps; }
// FFT for float32
esp_err_t fft(float *data, int fft_length)
{
dl_fft_f32_t *handle = get_or_create_handle(
fft_length, fft_f32_handles, [](int len, uint32_t caps) { return dl_fft_f32_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
return dl_fft_f32_run(handle, data);
}
// IFFT for float32
esp_err_t ifft(float *data, int fft_length)
{
dl_fft_f32_t *handle = get_or_create_handle(
fft_length, fft_f32_handles, [](int len, uint32_t caps) { return dl_fft_f32_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
return dl_ifft_f32_run(handle, data);
}
// RFFT for float32
esp_err_t rfft(float *data, int fft_length)
{
dl_fft_f32_t *handle = get_or_create_handle(
fft_length, rfft_f32_handles, [](int len, uint32_t caps) { return dl_rfft_f32_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
return dl_rfft_f32_run(handle, data);
}
// IRFFT for float32
esp_err_t irfft(float *data, int fft_length)
{
dl_fft_f32_t *handle = get_or_create_handle(
fft_length, rfft_f32_handles, [](int len, uint32_t caps) { return dl_rfft_f32_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
return dl_irfft_f32_run(handle, data);
}
// FFT for int16
esp_err_t fft(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, fft_s16_handles, [](int len, uint32_t caps) { return dl_fft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_fft_s16_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// IFFT for int16
esp_err_t ifft(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, fft_s16_handles, [](int len, uint32_t caps) { return dl_fft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_ifft_s16_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// RFFT for int16
esp_err_t rfft(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, rfft_s16_handles, [](int len, uint32_t caps) { return dl_rfft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_rfft_s16_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// IRFFT for int16
esp_err_t irfft(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, rfft_s16_handles, [](int len, uint32_t caps) { return dl_rfft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_irfft_s16_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// FFT with high precision for int16
esp_err_t fft_hp(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, fft_s16_handles, [](int len, uint32_t caps) { return dl_fft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_fft_s16_hp_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// IFFT with high precision for int16
esp_err_t ifft_hp(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, fft_s16_handles, [](int len, uint32_t caps) { return dl_fft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_ifft_s16_hp_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// RFFT with high precision for int16
esp_err_t rfft_hp(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, rfft_s16_handles, [](int len, uint32_t caps) { return dl_rfft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_rfft_s16_hp_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// IRFFT with high precision for int16
esp_err_t irfft_hp(int16_t *data, int fft_length, int in_exponent = 0, int *out_exponent = nullptr)
{
dl_fft_s16_t *handle = get_or_create_handle(
fft_length, rfft_s16_handles, [](int len, uint32_t caps) { return dl_rfft_s16_init(len, caps); });
if (!handle) {
return ESP_FAIL;
}
int temp_out_exp = 0;
esp_err_t result = dl_irfft_s16_hp_run(handle, data, in_exponent, out_exponent ? out_exponent : &temp_out_exp);
return result;
}
// WARNING: This function is NOT thread-safe with respect to concurrent FFT operations.
// It should only be called when no other FFT methods are running, as it will deinitialize all handles
// and may cause undefined behavior if other threads are using FFT functions.
// Ensure all FFT operations have completed before calling clear().
void clear()
{
ESP_LOGW("FFT",
"This function is NOT thread-safe. Ensure all FFT operations have completed before calling clear()");
std::lock_guard<std::mutex> lock(mutex_);
// Clear FFT float32 handles
for (auto *handle : fft_f32_handles) {
dl_fft_f32_deinit(handle);
}
fft_f32_handles.clear();
std::vector<dl_fft_f32_t *>().swap(fft_f32_handles);
// Clear FFT int16 handles
for (auto *handle : fft_s16_handles) {
dl_fft_s16_deinit(handle);
}
fft_s16_handles.clear();
std::vector<dl_fft_s16_t *>().swap(fft_s16_handles);
// Clear RFFT float32 handles
for (auto *handle : rfft_f32_handles) {
dl_rfft_f32_deinit(handle);
}
rfft_f32_handles.clear();
std::vector<dl_fft_f32_t *>().swap(rfft_f32_handles);
// Clear RFFT int16 handles
for (auto *handle : rfft_s16_handles) {
dl_rfft_s16_deinit(handle);
}
rfft_s16_handles.clear();
std::vector<dl_fft_s16_t *>().swap(rfft_s16_handles);
}
// Get handle count for debugging
size_t get_handle_count()
{
std::lock_guard<std::mutex> lock(mutex_);
return fft_f32_handles.size() + fft_s16_handles.size() + rfft_f32_handles.size() + rfft_s16_handles.size();
}
};
} // namespace dl

View File

@@ -0,0 +1,88 @@
#include "dl_fft.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>
static const char *TAG = "dl fft";
// Allocate and populate a handle for a complex float32 FFT of fft_point points.
// All allocations use the given heap caps.
// Returns NULL when dl_is_power_of_two() rejects fft_point, when the handle cannot be
// allocated, or when the twiddle table cannot be generated.
// The result of the bit-reversal table generation is stored as-is, without a NULL check.
dl_fft_f32_t *dl_fft_f32_init(int fft_point, uint32_t caps)
{
    if (!dl_is_power_of_two(fft_point)) {
        ESP_LOGE(TAG, "FFT point must be power of two");
        return NULL;
    }

    dl_fft_f32_t *fft = (dl_fft_f32_t *)heap_caps_malloc(sizeof(dl_fft_f32_t), caps);
    if (fft == NULL) {
        ESP_LOGE(TAG, "Failed to allocate FFT handle");
        return NULL;
    }

    // Null the table pointers first so dl_fft_f32_deinit() is safe on the error path.
    fft->rfft_table = NULL;
    fft->bitrev_table = NULL;
    fft->fft_point = fft_point;
    fft->log2n = dl_power_of_two(fft_point);

    fft->fft_table = dl_gen_fftr2_table_f32(fft_point, caps);
    if (fft->fft_table == NULL) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_fft_f32_deinit(fft);
        return NULL;
    }

    fft->bitrev_table = dl_gen_bitrev2r_table(fft_point, caps, &fft->bitrev_size);
    return fft;
}
// Release a float32 FFT handle together with its twiddle, rfft and bit-reversal tables.
// A NULL handle is ignored.
void dl_fft_f32_deinit(dl_fft_f32_t *handle)
{
    if (handle == NULL) {
        return;
    }

    // free(NULL) is a no-op, so tables that were never generated need no special casing.
    free(handle->fft_table);
    free(handle->rfft_table);
    free(handle->bitrev_table);
    free(handle);
}
// Run the forward complex float32 FFT in place on data: radix-2 transform followed by
// the bit-reversal reordering, both sized by handle->fft_point.
// Returns ESP_FAIL when handle or data is NULL, ESP_OK otherwise.
esp_err_t dl_fft_f32_run(dl_fft_f32_t *handle, float *data)
{
    if (handle == NULL || data == NULL) {
        return ESP_FAIL;
    }

    const int n = handle->fft_point;
    dl_fft2r_fc32(data, n, handle->fft_table);
    dl_bitrev2r_fc32_ansi(data, n, handle->bitrev_table, handle->bitrev_size);

    return ESP_OK;
}
// Run the inverse complex float32 FFT in place on data: radix-2 inverse transform,
// bit-reversal reordering, then scaling of all 2 * fft_point floats by 1 / fft_point.
// Returns ESP_FAIL when handle or data is NULL, ESP_OK otherwise.
esp_err_t dl_ifft_f32_run(dl_fft_f32_t *handle, float *data)
{
    if (handle == NULL || data == NULL) {
        return ESP_FAIL;
    }

    int n = handle->fft_point;
    float inv_n = 1.0f / n;

    dl_ifft2r_fc32(data, n, handle->fft_table);
    dl_bitrev2r_fc32_ansi(data, n, handle->bitrev_table, handle->bitrev_size);

    // Normalize the result by 1/N.
    int total = n * 2;
    for (int k = 0; k < total; k++) {
        data[k] = data[k] * inv_n;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,106 @@
#include "dl_fft.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>
static const char *TAG = "dl fft";
// Allocate and populate a handle for a complex 16-bit fixed-point FFT of fft_point points.
// All allocations use the given heap caps.
// Returns NULL when dl_is_power_of_two() rejects fft_point, when the handle cannot be
// allocated, or when the twiddle table cannot be generated.
dl_fft_s16_t *dl_fft_s16_init(int fft_point, uint32_t caps)
{
    if (!dl_is_power_of_two(fft_point)) {
        ESP_LOGE(TAG, "FFT point must be power of two");
        return NULL;
    }

    dl_fft_s16_t *fft = (dl_fft_s16_t *)heap_caps_malloc(sizeof(dl_fft_s16_t), caps);
    if (fft == NULL) {
        ESP_LOGE(TAG, "Failed to allocate FFT handle");
        return NULL;
    }

    // This handle kind never owns an rfft table.
    fft->rfft_table = NULL;
    fft->fft_point = fft_point;
    fft->log2n = dl_power_of_two(fft_point);

    fft->fft_table = dl_gen_fft_table_sc16(fft_point, caps);
    if (fft->fft_table == NULL) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_fft_s16_deinit(fft);
        return NULL;
    }

    return fft;
}
// Release a 16-bit fixed-point FFT handle and every table it owns.
// A NULL handle is ignored.
// rfft_table is released as well, matching dl_fft_f32_deinit() and dl_rfft_s16_deinit():
// dl_fft_s16_init() leaves it NULL (free(NULL) is a no-op), and a handle created by
// dl_rfft_s16_init() that ends up here no longer leaks its rfft table.
void dl_fft_s16_deinit(dl_fft_s16_t *handle)
{
    if (!handle) {
        return;
    }

    free(handle->fft_table);
    free(handle->rfft_table);
    free(handle);
}
// Run the forward complex 16-bit fixed-point FFT in place on data.
//   handle       - instance created by dl_fft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent + handle->log2n
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_fft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is written unconditionally below, so a NULL must be rejected here.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int fft_point = handle->fft_point;

    dl_fft2r_sc16(data, fft_point, handle->fft_table);
    dl_bitrev2r_sc16_ansi(data, fft_point);
    out_exponent[0] = in_exponent + handle->log2n;

    return ESP_OK;
}
// Run the inverse complex 16-bit fixed-point FFT in place on data.
//   handle       - instance created by dl_fft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent unchanged
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_ifft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is written unconditionally below, so a NULL must be rejected here.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int fft_point = handle->fft_point;

    dl_ifft2r_sc16(data, fft_point, handle->fft_table);
    dl_bitrev2r_sc16_ansi(data, fft_point);
    out_exponent[0] = in_exponent;

    return ESP_OK;
}
// Run the high-precision forward complex 16-bit fixed-point FFT in place on data.
//   handle       - instance created by dl_fft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent plus the exponent reported by dl_fft2r_sc16_hp()
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_fft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is both handed to the kernel and written here, so a NULL must be rejected.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int fft_point = handle->fft_point;

    // Start from zero so the kernel's reported exponent can be added to in_exponent afterwards.
    out_exponent[0] = 0;
    dl_fft2r_sc16_hp(data, fft_point, handle->fft_table, out_exponent);
    dl_bitrev2r_sc16_ansi(data, fft_point);
    out_exponent[0] = in_exponent + out_exponent[0];

    return ESP_OK;
}
// Run the high-precision inverse complex 16-bit fixed-point FFT in place on data.
//   handle       - instance created by dl_fft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent plus the exponent reported by dl_ifft2r_sc16_hp(),
//                  minus handle->log2n
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_ifft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is both handed to the kernel and written here, so a NULL must be rejected.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int fft_point = handle->fft_point;

    // Start from zero so the kernel's reported exponent can be combined afterwards.
    out_exponent[0] = 0;
    dl_ifft2r_sc16_hp(data, fft_point, handle->fft_table, out_exponent);
    dl_bitrev2r_sc16_ansi(data, fft_point);
    out_exponent[0] = in_exponent + out_exponent[0] - handle->log2n;

    return ESP_OK;
}

View File

@@ -0,0 +1,146 @@
#pragma once
#include "dl_fft.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Data format for in-place rfft
input: real data, size = fft_point
output: only one side of the spectrum is returned, because the real-to-complex Fourier transform
satisfies conjugate symmetry:
x[0] = real part of DC component
x[1] = real part of fft_point/2 component
x[2] = real part of 1st component
x[3] = imaginary part of 1st component
......
x[fft_point-2] = real part of fft_point/2-1 component
x[fft_point-1] = imaginary part of fft_point/2-1 component
*/
/**
* @brief Initialize a single-precision floating-point real FFT instance
* @param fft_point Number of FFT points (must be power of two)
* @param caps Configuration flags for memory allocation, same with esp-idf heap_caps_malloc
* (e.g., MALLOC_CAP_8BIT, MALLOC_CAP_INTERNAL, MALLOC_CAP_SPIRAM)
* @return dl_fft_f32_t* Handle to FFT instance, or NULL on failure
*/
dl_fft_f32_t *dl_rfft_f32_init(int fft_point, uint32_t caps);
/**
* @brief Deinitialize a single-precision floating-point real FFT instance
* @param handle FFT instance handle created by dl_rfft_f32_init()
*/
void dl_rfft_f32_deinit(dl_fft_f32_t *handle);
/**
* @brief Execute single-precision floating-point real FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_rfft_f32_run(dl_fft_f32_t *handle, float *data);
/**
* @brief Execute single-precision floating-point real inverse FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_irfft_f32_run(dl_fft_f32_t *handle, float *data);
/**
* @brief Execute 16-bit fixed-point FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_fft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute inverse 16-bit fixed-point FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_ifft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point FFT with high-precision scaling
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_fft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute inverse 16-bit fixed-point FFT with high-precision scaling
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_ifft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Initialize a 16-bit fixed-point real FFT instance
* @param fft_point Number of FFT points (must be power of two)
* @param caps Configuration flags for memory allocation, same with esp-idf heap_caps_malloc
* (e.g., MALLOC_CAP_8BIT, MALLOC_CAP_INTERNAL, MALLOC_CAP_SPIRAM)
* @return dl_fft_s16_t* Handle to FFT instance, or NULL on failure
*/
dl_fft_s16_t *dl_rfft_s16_init(int fft_point, uint32_t caps);
/**
* @brief Deinitialize a 16-bit fixed-point real FFT instance
* @param handle FFT instance handle created by dl_rfft_s16_init()
*/
void dl_rfft_s16_deinit(dl_fft_s16_t *handle);
/**
* @brief Execute 16-bit fixed-point real FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_rfft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point real FFT with high-precision scaling
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_rfft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point real inverse FFT transform
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_irfft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
/**
* @brief Execute 16-bit fixed-point real inverse FFT with high-precision scaling
* @param handle FFT instance handle
* @param data Input/output buffer, in-place fft calculation
* @param in_exponent Input data exponent (2^in_exponent scaling factor)
* @param out_exponent Output data exponent (2^out_exponent scaling factor)
* @return esp_err_t ESP_OK on success, error code otherwise
*/
esp_err_t dl_irfft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent);
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,120 @@
#include "dl_rfft.h"
#include "esp_attr.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>
static const char *TAG = "dl rfft";
// Allocate and populate a handle for a real-input float32 FFT of fft_point points.
// All allocations use the given heap caps.
// The rfft table is generated first. When log2n is odd, the radix-4 bit-reversal and
// twiddle generators are called with fft_point; otherwise the radix-2 generators are
// called with fft_point >> 1.
// Returns NULL when dl_is_power_of_two() rejects fft_point, when the handle cannot be
// allocated, or when the rfft or twiddle table cannot be generated.
// The result of the bit-reversal table generation is stored as-is, without a NULL check.
dl_fft_f32_t *dl_rfft_f32_init(int fft_point, uint32_t caps)
{
    if (!dl_is_power_of_two(fft_point)) {
        ESP_LOGE(TAG, "FFT point must be power of two");
        return NULL;
    }

    dl_fft_f32_t *fft = (dl_fft_f32_t *)heap_caps_malloc(sizeof(dl_fft_f32_t), caps);
    if (fft == NULL) {
        ESP_LOGE(TAG, "Failed to allocate FFT handle");
        return NULL;
    }

    // Null the table pointers first so dl_rfft_f32_deinit() is safe on every error path.
    fft->fft_table = NULL;
    fft->bitrev_table = NULL;
    fft->fft_point = fft_point;
    fft->log2n = dl_power_of_two(fft_point);

    fft->rfft_table = dl_gen_rfft_table_f32(fft_point, caps);
    if (fft->rfft_table == NULL) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_rfft_f32_deinit(fft);
        return NULL;
    }

    if (fft->log2n % 2 == 1) {
        fft->bitrev_table = dl_gen_bitrev4r_table(fft_point, caps, &fft->bitrev_size);
        fft->fft_table = dl_gen_fft4r_table_f32(fft_point, caps);
    } else {
        fft->bitrev_table = dl_gen_bitrev2r_table(fft_point >> 1, caps, &fft->bitrev_size);
        fft->fft_table = dl_gen_fftr2_table_f32(fft_point >> 1, caps);
    }

    // Both branches share the same failure handling for the twiddle table.
    if (fft->fft_table == NULL) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_rfft_f32_deinit(fft);
        return NULL;
    }

    return fft;
}
// Release a real-input float32 FFT handle together with its twiddle, rfft and
// bit-reversal tables. A NULL handle is ignored.
void dl_rfft_f32_deinit(dl_fft_f32_t *handle)
{
    if (handle == NULL) {
        return;
    }

    // free(NULL) is a no-op, so tables that were never generated need no special casing.
    free(handle->fft_table);
    free(handle->rfft_table);
    free(handle->bitrev_table);
    free(handle);
}
// Run the real-input float32 FFT in place on data.
// A complex transform of fft_point / 2 points is executed (radix-4 when log2n is odd,
// radix-2 otherwise), followed by bit reversal and the rfft post-processing step that
// converts the half-length complex result into the real-input spectrum.
// Returns ESP_FAIL when handle or data is NULL, ESP_OK otherwise.
esp_err_t dl_rfft_f32_run(dl_fft_f32_t *handle, float *data)
{
    if (handle == NULL || data == NULL) {
        return ESP_FAIL;
    }

    int n = handle->fft_point;
    int half = n >> 1;

    if (handle->log2n % 2 == 1) {
        dl_fft4r_fc32(data, half, handle->fft_table, n);
        dl_bitrev4r_fc32_ansi(data, half, handle->bitrev_table, handle->bitrev_size);
    } else {
        dl_fft2r_fc32(data, half, handle->fft_table);
        dl_bitrev2r_fc32_ansi(data, half, handle->bitrev_table, handle->bitrev_size);
    }

    // Turn the N/2-point complex result into the one-sided spectrum of the real input.
    dl_rfft_post_proc_fc32_ansi(data, half, handle->rfft_table);

    return ESP_OK;
}
// Run the real-output inverse float32 FFT in place on data.
// The rfft pre-processing step runs first, then an inverse complex transform of
// fft_point / 2 points (radix-4 when log2n is odd, radix-2 otherwise) and bit reversal.
// Finally all fft_point samples are multiplied by 2 / fft_point.
// Returns ESP_FAIL when handle or data is NULL, ESP_OK otherwise.
esp_err_t dl_irfft_f32_run(dl_fft_f32_t *handle, float *data)
{
    if (handle == NULL || data == NULL) {
        return ESP_FAIL;
    }

    int n = handle->fft_point;
    int half = n >> 1;
    float gain = 2.0 / n;

    dl_rfft_pre_proc_fc32_ansi(data, half, handle->rfft_table);

    if (handle->log2n % 2 == 1) {
        dl_ifft4r_fc32(data, half, handle->fft_table, n);
        dl_bitrev4r_fc32_ansi(data, half, handle->bitrev_table, handle->bitrev_size);
    } else {
        dl_ifft2r_fc32(data, half, handle->fft_table);
        dl_bitrev2r_fc32_ansi(data, half, handle->bitrev_table, handle->bitrev_size);
    }

    // Normalize: the inverse transform ran on N/2 complex points, hence the factor 2/N.
    for (int k = 0; k < n; k++) {
        data[k] = data[k] * gain;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,119 @@
#include "dl_rfft.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>
static const char *TAG = "dl rfft";
// Allocate and populate a handle for a real-input 16-bit fixed-point FFT of fft_point points.
// All allocations use the given heap caps. The rfft table is generated for fft_point and
// the complex twiddle table for fft_point >> 1.
// Returns NULL when dl_is_power_of_two() rejects fft_point, when the handle cannot be
// allocated, or when either table cannot be generated.
dl_fft_s16_t *dl_rfft_s16_init(int fft_point, uint32_t caps)
{
    // Same validation as dl_fft_s16_init(), dl_fft_f32_init() and dl_rfft_f32_init():
    // without it log2n and both tables would be derived from an unsupported length.
    if (!dl_is_power_of_two(fft_point)) {
        ESP_LOGE(TAG, "FFT point must be power of two");
        return NULL;
    }

    dl_fft_s16_t *handle = (dl_fft_s16_t *)heap_caps_malloc(sizeof(dl_fft_s16_t), caps);
    if (!handle) {
        ESP_LOGE(TAG, "Failed to allocate FFT handle");
        return NULL;
    }

    // Null the table pointers first so dl_rfft_s16_deinit() is safe on every error path.
    handle->fft_table = NULL;
    handle->rfft_table = NULL;
    handle->fft_point = fft_point;
    handle->log2n = dl_power_of_two(fft_point);

    // rfft table
    handle->rfft_table = dl_gen_rfft_table_s16(fft_point, caps);
    if (!handle->rfft_table) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_rfft_s16_deinit(handle);
        return NULL;
    }

    // fft table, sized for the half-length complex transform
    handle->fft_table = dl_gen_fft_table_sc16(fft_point >> 1, caps);
    if (!handle->fft_table) {
        ESP_LOGE(TAG, "Failed to generate FFT table");
        dl_rfft_s16_deinit(handle);
        return NULL;
    }

    return handle;
}
// Release a real-input 16-bit fixed-point FFT handle together with its twiddle and
// rfft tables. A NULL handle is ignored.
void dl_rfft_s16_deinit(dl_fft_s16_t *handle)
{
    if (handle == NULL) {
        return;
    }

    // free(NULL) is a no-op, so tables that were never generated need no special casing.
    free(handle->fft_table);
    free(handle->rfft_table);
    free(handle);
}
// Run the real-input 16-bit fixed-point FFT in place on data.
// A complex transform of fft_point / 2 points is executed, followed by bit reversal
// and the rfft post-processing step.
//   handle       - instance created by dl_rfft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent + handle->log2n
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_rfft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is written unconditionally below, so a NULL must be rejected here.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int cpx_point = handle->fft_point >> 1;

    dl_fft2r_sc16(data, cpx_point, handle->fft_table);
    dl_bitrev2r_sc16_ansi(data, cpx_point);
    dl_rfft_post_proc_sc16_ansi(data, cpx_point, handle->rfft_table);
    out_exponent[0] = in_exponent + handle->log2n;

    return ESP_OK;
}
// Run the high-precision real-input 16-bit fixed-point FFT in place on data.
// A complex transform of fft_point / 2 points is executed, followed by bit reversal
// and the rfft post-processing step.
//   handle       - instance created by dl_rfft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent plus the exponent reported by dl_fft2r_sc16_hp(), plus 1
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_rfft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is both handed to the kernel and written here, so a NULL must be rejected.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int cpx_point = handle->fft_point >> 1;

    // Start from zero so the kernel's reported exponent can be combined afterwards.
    out_exponent[0] = 0;
    dl_fft2r_sc16_hp(data, cpx_point, handle->fft_table, out_exponent);
    dl_bitrev2r_sc16_ansi(data, cpx_point);
    dl_rfft_post_proc_sc16_ansi(data, cpx_point, handle->rfft_table);
    out_exponent[0] = in_exponent + out_exponent[0] + 1;

    return ESP_OK;
}
// Run the real-output inverse 16-bit fixed-point FFT in place on data.
// The rfft pre-processing step runs first, then an inverse complex transform of
// fft_point / 2 points and bit reversal.
//   handle       - instance created by dl_rfft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent + 1
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_irfft_s16_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is written unconditionally below, so a NULL must be rejected here.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int cpx_point = handle->fft_point >> 1;

    dl_rfft_pre_proc_sc16_ansi(data, cpx_point, handle->rfft_table);
    dl_ifft2r_sc16(data, cpx_point, handle->fft_table);
    dl_bitrev2r_sc16_ansi(data, cpx_point);
    out_exponent[0] = in_exponent + 1;

    return ESP_OK;
}
// Run the high-precision real-output inverse 16-bit fixed-point FFT in place on data.
// The rfft pre-processing step runs first, then an inverse complex transform of
// fft_point / 2 points and bit reversal.
//   handle       - instance created by dl_rfft_s16_init()
//   data         - input/output buffer, transformed in place
//   in_exponent  - exponent of the input data
//   out_exponent - receives in_exponent plus the exponent reported by dl_ifft2r_sc16_hp(),
//                  plus 2, minus handle->log2n
// Returns ESP_FAIL, leaving data untouched, when any pointer argument is NULL; ESP_OK otherwise.
esp_err_t dl_irfft_s16_hp_run(dl_fft_s16_t *handle, int16_t *data, int in_exponent, int *out_exponent)
{
    // out_exponent is both handed to the kernel and written here, so a NULL must be rejected.
    if (!handle || !data || !out_exponent) {
        return ESP_FAIL;
    }

    int cpx_point = handle->fft_point >> 1;

    // Start from zero so the kernel's reported exponent can be combined afterwards.
    out_exponent[0] = 0;
    dl_rfft_pre_proc_sc16_ansi(data, cpx_point, handle->rfft_table);
    dl_ifft2r_sc16_hp(data, cpx_point, handle->fft_table, out_exponent);
    dl_bitrev2r_sc16_ansi(data, cpx_point);
    out_exponent[0] = in_exponent + out_exponent[0] + 2 - handle->log2n;

    return ESP_OK;
}

View File

@@ -0,0 +1,10 @@
dependencies:
idf: '>=5.0'
description: dl_fft is a lightweight and efficient fft library for all espressif chips.
license: MIT
repository: git://github.com/espressif/esp-dl.git
repository_info:
commit_sha: 48f53066553a3483d6c710998609aedff0ea20bc
path: tools/dl_fft
url: https://github.com/espressif/esp-dl/tree/master/esp-dl/tools/dl_fft
version: 0.3.1