add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,258 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dl_fft_base.h"
// unsigned short reverse(unsigned short x, unsigned short N, int order);
// Radix-2 in-place complex FFT (decimation-in-frequency), ANSI reference.
// `data` holds N interleaved (re, im) float pairs; `w` is the twiddle table
// produced by dl_gen_fftr2_table_f32. Always returns ESP_OK.
esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w)
{
    int span = N >> 1; // butterfly distance, halves every stage
    int groups = 1;    // distinct twiddles per stage, doubles every stage

    while (span > 0) {
        int base = 0;
        for (int g = 0; g < groups; g++) {
            const float c = w[2 * g];
            const float s = w[2 * g + 1];
            for (int b = 0; b < span; b++) {
                const int top = base;
                const int bot = base + span;
                // Twiddled lower leg: (c - i*s) applied to data[bot].
                const float re_t = c * data[2 * bot] + s * data[2 * bot + 1];
                const float im_t = c * data[2 * bot + 1] - s * data[2 * bot];
                data[2 * bot] = data[2 * top] - re_t;
                data[2 * bot + 1] = data[2 * top + 1] - im_t;
                data[2 * top] += re_t;
                data[2 * top + 1] += im_t;
                base++;
            }
            base += span; // skip the lower half we just wrote
        }
        groups <<= 1;
        span >>= 1;
    }
    return ESP_OK;
}
// Radix-2 in-place inverse complex FFT, ANSI reference. Identical to
// dl_fft2r_fc32_ansi except the sine term of each twiddle is negated.
// No 1/N normalization is applied here. Always returns ESP_OK.
esp_err_t dl_ifft2r_fc32_ansi(float *data, int N, float *w)
{
    int span = N >> 1; // butterfly distance, halves every stage
    int groups = 1;    // distinct twiddles per stage, doubles every stage

    while (span > 0) {
        int base = 0;
        for (int g = 0; g < groups; g++) {
            const float c = w[2 * g];
            const float s = -w[2 * g + 1]; // negated sine: inverse transform
            for (int b = 0; b < span; b++) {
                const int top = base;
                const int bot = base + span;
                const float re_t = c * data[2 * bot] + s * data[2 * bot + 1];
                const float im_t = c * data[2 * bot + 1] - s * data[2 * bot];
                data[2 * bot] = data[2 * top] - re_t;
                data[2 * bot + 1] = data[2 * top + 1] - im_t;
                data[2 * top] += re_t;
                data[2 * top + 1] += im_t;
                base++;
            }
            base += span;
        }
        groups <<= 1;
        span >>= 1;
    }
    return ESP_OK;
}
// In-place bit-reversal reordering of N complex float values (re/im interleaved).
// When `bitrev_table` is non-NULL it holds `bitrev_size` precomputed swap pairs,
// already scaled to float-array offsets (complex index * 2) as emitted by
// dl_gen_bitrev2r_table(); otherwise the swap partners are derived on the fly.
esp_err_t dl_bitrev2r_fc32_ansi(float *data, int N, uint16_t *bitrev_table, int bitrev_size)
{
    esp_err_t result = ESP_OK;
    if (bitrev_table) {
        float r_temp, i_temp;
        for (int n = 0; n < bitrev_size; n++) {
            // i/j are offsets into the float array (already complex index * 2).
            uint16_t i = bitrev_table[n * 2];
            uint16_t j = bitrev_table[n * 2 + 1];
            // Swap real parts, then imaginary parts.
            r_temp = data[j];
            data[j] = data[i];
            data[i] = r_temp;
            i_temp = data[j + 1];
            data[j + 1] = data[i + 1];
            data[i + 1] = i_temp;
        }
    } else {
        int j, k;
        float r_temp, i_temp;
        j = 0;
        // Classic bit-reversed counter: j carries the reversed index of i
        // across iterations, so this loop must run in order.
        for (int i = 1; i < (N - 1); i++) {
            k = N >> 1;
            while (k <= j) {
                j -= k;
                k >>= 1;
            }
            j += k;
            // Swap each pair once (only when i precedes its reversed partner).
            if (i < j) {
                r_temp = data[j * 2];
                data[j * 2] = data[i * 2];
                data[i * 2] = r_temp;
                i_temp = data[j * 2 + 1];
                data[j * 2 + 1] = data[i * 2 + 1];
                data[i * 2 + 1] = i_temp;
            }
        }
    }
    return result;
}
// Post-processing ("split") step after an N-point complex FFT of packed real
// data: reconstructs the real-signal spectrum in place. `table` is the layout
// produced by dl_gen_rfft_table_f32 (cos at even-1 offsets, sin at even-2 —
// see that generator for the exact indexing).
esp_err_t dl_rfft_post_proc_fc32_ansi(float *data, int N, float *table)
{
    dl_fc32_t *result = (dl_fc32_t *)data;
    // Original formula...
    // result[0].re = result[0].re + result[0].im;
    // result[N].re = result[0].re - result[0].im;
    // result[0].im = 0;
    // result[N].im = 0;
    // Optimized one: DC term packed into bin0.re, Nyquist term into bin0.im.
    float tmp_re = result[0].re;
    result[0].re = tmp_re + result[0].im;
    result[0].im = tmp_re - result[0].im;
    dl_fc32_t f1k, f2k;
    for (int k = 1; k <= N / 2; k++) {
        dl_fc32_t fpk = result[k];
        dl_fc32_t fpnk = result[N - k];
        // f1k: even part of the packed spectrum, f2k: odd part.
        f1k.re = fpk.re + fpnk.re;
        f1k.im = fpk.im - fpnk.im;
        f2k.re = fpk.re - fpnk.re;
        f2k.im = fpk.im + fpnk.im;
        // Twiddle components are read negated from the table.
        float c = -table[k * 2 - 1];
        float s = -table[k * 2 - 2];
        dl_fc32_t tw;
        tw.re = c * f2k.re - s * f2k.im;
        tw.im = s * f2k.re + c * f2k.im;
        result[k].re = 0.5 * (f1k.re + tw.re);
        result[k].im = 0.5 * (f1k.im + tw.im);
        result[N - k].re = 0.5 * (f1k.re - tw.re);
        // Note the sign: the mirrored bin stores (tw.im - f1k.im), i.e. the
        // upper half is written conjugated relative to the lower half.
        result[N - k].im = 0.5 * (tw.im - f1k.im);
    }
    return ESP_OK;
}
// Pre-processing step before the inverse path of the real FFT: converts a
// real-signal spectrum back into the packed form expected by an N-point
// complex inverse FFT. Mirrors dl_rfft_post_proc_fc32_ansi with the `s`
// twiddle component read with opposite sign.
esp_err_t dl_rfft_pre_proc_fc32_ansi(float *data, int N, float *table)
{
    dl_fc32_t *result = (dl_fc32_t *)data;
    // Unpack DC/Nyquist from bin 0; the 0.5 keeps scaling consistent with
    // the per-bin 0.5 factors in the loop below.
    float tmp_re = result[0].re;
    result[0].re = (tmp_re + result[0].im) * 0.5;
    result[0].im = (tmp_re - result[0].im) * 0.5;
    dl_fc32_t f1k, f2k;
    for (int k = 1; k <= N / 2; k++) {
        dl_fc32_t fpk = result[k];
        dl_fc32_t fpnk = result[N - k];
        // Even/odd decomposition, as in the post-proc step.
        f1k.re = fpk.re + fpnk.re;
        f1k.im = fpk.im - fpnk.im;
        f2k.re = fpk.re - fpnk.re;
        f2k.im = fpk.im + fpnk.im;
        // Only `c` is negated here (post-proc negates both) — this sign
        // difference is what distinguishes the forward/inverse directions.
        float c = -table[k * 2 - 1];
        float s = table[k * 2 - 2];
        dl_fc32_t tw;
        tw.re = c * f2k.re - s * f2k.im;
        tw.im = s * f2k.re + c * f2k.im;
        result[k].re = 0.5 * (f1k.re + tw.re);
        result[k].im = 0.5 * (f1k.im + tw.im);
        result[N - k].re = 0.5 * (f1k.re - tw.re);
        result[N - k].im = 0.5 * (tw.im - f1k.im);
    }
    return ESP_OK;
}
float *dl_gen_rfft_table_f32(int fft_point, uint32_t caps)
{
float *fft_table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float), caps);
if (fft_table) {
for (int i = 1; i <= fft_point >> 1; i++) {
float angle = 2 * M_PI * i * 1.0 / fft_point;
fft_table[2 * i - 2] = cosf(angle);
fft_table[2 * i - 1] = sinf(angle);
}
}
return fft_table;
}
// Builds the swap table consumed by dl_bitrev2r_fc32_ansi: pairs of
// float-array offsets (complex index * 2) for every swap of the radix-2
// bit-reversal permutation of N complex values.
//
// Fixes vs. the original:
//  - *bitrev_size is now set to 0 on every failure path (the original left it
//    uninitialized when returning NULL on the range check, unlike
//    dl_gen_bitrev4r_table which zeroes it).
//  - The range check now bounds the largest stored offset, 2*(N-1), instead
//    of `count * 2` (count is the number of swaps, not an offset, so the old
//    check could accept tables whose entries overflow uint16_t).
uint16_t *dl_gen_bitrev2r_table(int N, uint32_t caps, int *bitrev_size)
{
    bitrev_size[0] = 0; // default for all failure paths

    // Entries are stored as float-array offsets; the largest one is 2*(N-1).
    if (2 * (N - 1) > UINT16_MAX) {
        return NULL;
    }

    // First pass: count the swaps so we can size the table exactly.
    int count = 0;
    int j = 0;
    for (int i = 1; i < (N - 1); i++) {
        int k = N >> 1;
        while (k <= j) {
            j -= k;
            k >>= 1;
        }
        j += k; // j is now the bit-reversed index of i
        if (i < j) {
            count++;
        }
    }

    uint16_t *bitrev_table = (uint16_t *)heap_caps_malloc(2 * count * sizeof(uint16_t), caps);
    if (bitrev_table == NULL) {
        return NULL;
    }

    // Second pass: record each (reversed, original) offset pair.
    int idx = 0;
    j = 0;
    for (int i = 1; i < (N - 1); i++) {
        int k = N >> 1;
        while (k <= j) {
            j -= k;
            k >>= 1;
        }
        j += k;
        if (i < j) {
            bitrev_table[idx * 2] = j * 2;
            bitrev_table[idx * 2 + 1] = i * 2;
            idx++;
        }
    }
    bitrev_size[0] = count; // only published on success
    return bitrev_table;
}
// Allocates the twiddle table for the radix-2 complex FFT: (fft_point/2)
// (cos, sin) pairs of 2*pi*i/fft_point, then reorders them into the
// bit-reversed layout that dl_fft2r_fc32_ansi indexes per stage.
// Returns NULL if the allocation fails.
float *dl_gen_fftr2_table_f32(int fft_point, uint32_t caps)
{
    float *table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float), caps);
    if (table == NULL) {
        return NULL;
    }
    float e = M_PI * 2.0 / fft_point;
    for (int i = 0; i < (fft_point >> 1); i++) {
        table[2 * i] = cosf(i * e);
        table[2 * i + 1] = sinf(i * e);
    }
    // Treat the table as (fft_point/2) complex pairs and bit-reverse them.
    dl_bitrev2r_fc32_ansi(table, fft_point >> 1, NULL, 0);
    return table;
}

View File

@@ -0,0 +1,580 @@
#include "dl_fft_base.h"
// Fixed-point butterfly helper: (a0*2^15 - (a1*a2 + a3*a4) + rounding) >> shift.
// a0 is a Q15 sample promoted to Q30; the products are Q15*Q15 = Q30.
static inline int16_t dl_xtfixed_bf_1(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int32_t acc = (int32_t)a0 * 32768; // a0 << 15, without UB on negatives
    acc -= (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;
    acc += add_rount_mult; // rounding offset for the shift below
    return (int16_t)(acc >> result_shift);
}
// Fixed-point butterfly helper: (a0*2^15 - (a1*a2 - a3*a4) + rounding) >> shift.
static inline int16_t dl_xtfixed_bf_2(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int32_t acc = (int32_t)a0 * 32768; // a0 << 15, without UB on negatives
    acc -= (int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4;
    acc += add_rount_mult; // rounding offset for the shift below
    return (int16_t)(acc >> result_shift);
}
// Fixed-point butterfly helper: (a0*2^15 + (a1*a2 + a3*a4) + rounding) >> shift.
static inline int16_t dl_xtfixed_bf_3(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int32_t acc = (int32_t)a0 * 32768; // a0 << 15, without UB on negatives
    acc += (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;
    acc += add_rount_mult; // rounding offset for the shift below
    return (int16_t)(acc >> result_shift);
}
// Fixed-point butterfly helper: (a0*2^15 + (a1*a2 - a3*a4) + rounding) >> shift.
static inline int16_t dl_xtfixed_bf_4(
    int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift, int add_rount_mult)
{
    int32_t acc = (int32_t)a0 * 32768; // a0 << 15, without UB on negatives
    acc += (int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4;
    acc += add_rount_mult; // rounding offset for the shift below
    return (int16_t)(acc >> result_shift);
}
// Radix-2 in-place FFT on interleaved int16 (Q15) complex data, ANSI reference.
// Samples and twiddles are moved as packed 32-bit (re, im) words via the
// dl_sc16_t union. Each butterfly shifts right by 16 after the <<15/Q15
// multiply, i.e. every stage halves the data, giving an overall 1/N scaling.
esp_err_t dl_fft2r_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    esp_err_t result = ESP_OK;
    uint32_t *w = (uint32_t *)table;      // twiddle table as packed re/im words
    uint32_t *in_data = (uint32_t *)data; // samples as packed re/im words
    int ie, ia, m;
    dl_sc16_t cs; // c - re, s - im
    dl_sc16_t m_data;
    dl_sc16_t a_data;
    int add_rount_mult = 1 << 15; // rounding offset for the >>16 in the helpers
    ie = 1;
    for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
        ia = 0;
        for (int j = 0; j < ie; j++) {
            cs.data = w[j]; // one packed twiddle per group
            // c = w[2 * j];
            // s = w[2 * j + 1];
            for (int i = 0; i < N2; i++) {
                m = ia + N2;
                m_data.data = in_data[m];  // lower butterfly leg
                a_data.data = in_data[ia]; // upper butterfly leg
                // data[2 * m] = data[2 * ia] - re_temp;
                // data[2 * m + 1] = data[2 * ia + 1] - im_temp;
                dl_sc16_t m1;
                m1.re = dl_xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16,
                                        add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
                m1.im = dl_xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16,
                                        add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
                in_data[m] = m1.data;
                // data[2 * ia] = data[2 * ia] + re_temp;
                // data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
                dl_sc16_t m2;
                m2.re = dl_xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16,
                                        add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
                m2.im = dl_xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16,
                                        add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
                in_data[ia] = m2.data;
                ia++;
            }
            ia += N2; // skip the lower half just written
        }
        ie <<= 1;
    }
    return result;
}
// Radix-2 in-place inverse FFT on interleaved int16 (Q15) complex data.
// Identical to dl_fft2r_sc16_ansi except each twiddle is conjugated
// (cs.im negated) before use. Every stage halves the data (>>16 after the
// Q15 multiply), giving an overall 1/N scaling.
esp_err_t dl_ifft2r_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    esp_err_t result = ESP_OK;
    uint32_t *w = (uint32_t *)table;      // twiddle table as packed re/im words
    uint32_t *in_data = (uint32_t *)data; // samples as packed re/im words
    int ie, ia, m;
    dl_sc16_t cs; // c - re, s - im
    dl_sc16_t m_data;
    dl_sc16_t a_data;
    int add_rount_mult = 1 << 15; // rounding offset for the >>16 in the helpers
    ie = 1;
    for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
        ia = 0;
        for (int j = 0; j < ie; j++) {
            cs.data = w[j];
            cs.im = -cs.im; // conjugate the twiddle: inverse transform
            // c = w[2 * j];
            // s = w[2 * j + 1];
            for (int i = 0; i < N2; i++) {
                m = ia + N2;
                m_data.data = in_data[m];  // lower butterfly leg
                a_data.data = in_data[ia]; // upper butterfly leg
                // data[2 * m] = data[2 * ia] - re_temp;
                // data[2 * m + 1] = data[2 * ia + 1] - im_temp;
                dl_sc16_t m1;
                m1.re = dl_xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16,
                                        add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
                m1.im = dl_xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16,
                                        add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
                in_data[m] = m1.data;
                // data[2 * ia] = data[2 * ia] + re_temp;
                // data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
                dl_sc16_t m2;
                m2.re = dl_xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16,
                                        add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
                m2.im = dl_xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16,
                                        add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
                in_data[ia] = m2.data;
                ia++;
            }
            ia += N2; // skip the lower half just written
        }
        ie <<= 1;
    }
    return result;
}
// "High precision" radix-2 int16 FFT with dynamic per-stage scaling.
// Instead of unconditionally halving each stage, every third stage (starting
// with the first) re-measures the headroom of the data with
// dl_array_max_q_s16 and picks the butterfly shift accordingly; the other
// stages use the fixed >>16 (halving). The net exponent applied to the data
// is accumulated into shift[0] so the caller can undo the scaling.
esp_err_t dl_fft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
    esp_err_t result = ESP_OK;
    uint32_t *w = (uint32_t *)table;      // twiddle table as packed re/im words
    uint32_t *in_data = (uint32_t *)data; // samples as packed re/im words
    int ie, ia, m, loop_num = 2;          // loop_num == 2 forces a measurement on stage 1
    dl_sc16_t cs; // c - re, s - im
    dl_sc16_t m_data;
    dl_sc16_t a_data;
    int add_rount_mult = 1 << 15;
    ie = 1;
    shift[0] = 0; // accumulated output exponent
    for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
        ia = 0;
        int loop_shift = 16; // default: plain halving stage
        if (loop_num == 2) {
            // Re-measure headroom every third stage.
            loop_shift = dl_array_max_q_s16(data, N * 2);
            if (loop_shift < 16) {
                loop_shift += 1; // leave one extra bit for the butterfly growth
            }
            loop_num = 0;
        } else {
            loop_num += 1;
        }
        shift[0] += loop_shift - 15;      // record this stage's net exponent
        add_rount_mult = 1 << (loop_shift - 1); // rounding offset matching the shift
        for (int j = 0; j < ie; j++) {
            cs.data = w[j];
            // c = w[2 * j];
            // s = w[2 * j + 1];
            for (int i = 0; i < N2; i++) {
                m = ia + N2;
                m_data.data = in_data[m];  // lower butterfly leg
                a_data.data = in_data[ia]; // upper butterfly leg
                // data[2 * m] = data[2 * ia] - re_temp;
                // data[2 * m + 1] = data[2 * ia + 1] - im_temp;
                dl_sc16_t m1;
                m1.re = dl_xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, loop_shift,
                                        add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
                m1.im = dl_xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, loop_shift,
                                        add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
                in_data[m] = m1.data;
                // data[2 * ia] = data[2 * ia] + re_temp;
                // data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
                dl_sc16_t m2;
                m2.re = dl_xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, loop_shift,
                                        add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
                m2.im = dl_xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, loop_shift,
                                        add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
                in_data[ia] = m2.data;
                ia++;
            }
            ia += N2; // skip the lower half just written
        }
        ie <<= 1;
    }
    return result;
}
// "High precision" radix-2 int16 inverse FFT with dynamic per-stage scaling.
// Same scaling scheme as dl_fft2r_sc16_hp_ansi (headroom re-measured every
// third stage, net exponent accumulated in shift[0]); the twiddles are
// conjugated (cs.im negated) for the inverse direction.
esp_err_t dl_ifft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
    esp_err_t result = ESP_OK;
    uint32_t *w = (uint32_t *)table;      // twiddle table as packed re/im words
    uint32_t *in_data = (uint32_t *)data; // samples as packed re/im words
    int ie, ia, m, loop_num = 2;          // loop_num == 2 forces a measurement on stage 1
    dl_sc16_t cs; // c - re, s - im
    dl_sc16_t m_data;
    dl_sc16_t a_data;
    int add_rount_mult = 1 << 15;
    ie = 1;
    shift[0] = 0; // accumulated output exponent
    for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
        ia = 0;
        int loop_shift = 16; // default: plain halving stage
        if (loop_num == 2) {
            // Re-measure headroom every third stage.
            loop_shift = dl_array_max_q_s16(data, N * 2);
            if (loop_shift < 16) {
                loop_shift += 1; // leave one extra bit for the butterfly growth
            }
            loop_num = 0;
        } else {
            loop_num += 1;
        }
        shift[0] += loop_shift - 15;      // record this stage's net exponent
        add_rount_mult = 1 << (loop_shift - 1); // rounding offset matching the shift
        for (int j = 0; j < ie; j++) {
            cs.data = w[j];
            cs.im = -cs.im; // conjugate the twiddle: inverse transform
            // c = w[2 * j];
            // s = w[2 * j + 1];
            for (int i = 0; i < N2; i++) {
                m = ia + N2;
                m_data.data = in_data[m];  // lower butterfly leg
                a_data.data = in_data[ia]; // upper butterfly leg
                // data[2 * m] = data[2 * ia] - re_temp;
                // data[2 * m + 1] = data[2 * ia + 1] - im_temp;
                dl_sc16_t m1;
                m1.re = dl_xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, loop_shift,
                                        add_rount_mult); //(a_data.re - temp.re + shift_const) >> 1;
                m1.im = dl_xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, loop_shift,
                                        add_rount_mult); //(a_data.im - temp.im + shift_const) >> 1;
                in_data[m] = m1.data;
                // data[2 * ia] = data[2 * ia] + re_temp;
                // data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
                dl_sc16_t m2;
                m2.re = dl_xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, loop_shift,
                                        add_rount_mult); //(a_data.re + temp.re + shift_const) >> 1;
                m2.im = dl_xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, loop_shift,
                                        add_rount_mult); //(a_data.im + temp.im + shift_const)>>1;
                in_data[ia] = m2.data;
                ia++;
            }
            ia += N2; // skip the lower half just written
        }
        ie <<= 1;
    }
    return result;
}
// Returns the low `order` bits of x in reversed bit order.
// N is unused; it is kept so the signature matches the generic reverse().
static inline unsigned short reverse_sc16(unsigned short x, unsigned short N, int order)
{
    unsigned short v = x;
    // Full 16-bit reversal by swapping progressively smaller bit groups.
    v = (unsigned short)(((v & 0xff00) >> 8) | ((v & 0x00ff) << 8));
    v = (unsigned short)(((v & 0xf0f0) >> 4) | ((v & 0x0f0f) << 4));
    v = (unsigned short)(((v & 0xcccc) >> 2) | ((v & 0x3333) << 2));
    v = (unsigned short)(((v & 0xaaaa) >> 1) | ((v & 0x5555) << 1));
    // Keep only the top `order` bits of the reversed word.
    return (unsigned short)(v >> (16 - order));
}
// In-place bit-reversal reordering of N int16 complex values. Each (re, im)
// pair is moved as one packed 32-bit word. Always returns ESP_OK.
esp_err_t dl_bitrev2r_sc16_ansi(int16_t *data, int N)
{
    uint32_t *cplx = (uint32_t *)data; // one word per complex sample
    int rev = 0;                       // bit-reversed counterpart of the index

    for (int idx = 1; idx < N - 1; idx++) {
        // Advance `rev` as a bit-reversed counter.
        int bit = N >> 1;
        while (bit <= rev) {
            rev -= bit;
            bit >>= 1;
        }
        rev += bit;
        // Swap each pair exactly once.
        if (idx < rev) {
            uint32_t tmp = cplx[rev];
            cplx[rev] = cplx[idx];
            cplx[idx] = tmp;
        }
    }
    return ESP_OK;
}
// Rearranges the output of an N-point int16 complex FFT into the packed
// spectrum layout for the underlying 2N-point real input (fixed-point
// complex-to-real conversion). Reads are done word-wise through the
// dl_sc16_t union; writes go through the int16 view with n2 = 2N being the
// array length in int16 units.
// NOTE(review): the index arithmetic assumes N is a multiple of 4 — verify
// against the callers before reusing.
esp_err_t dl_cplx2reC_sc16(int16_t *data, int N)
{
    esp_err_t result = ESP_OK;
    int i;
    int n2 = N << (1); // we will operate with int32 indexes
    uint32_t *in_data = (uint32_t *)data;
    dl_sc16_t kl; // bin k, lower half
    dl_sc16_t kh; // bin k, upper half
    dl_sc16_t nl; // mirrored bin N-k, lower half
    dl_sc16_t nh; // mirrored bin N-k, upper half
    for (i = 0; i < (N / 4); i++) {
        kl.data = in_data[i + 1];
        nl.data = in_data[N - i - 1];
        kh.data = in_data[i + 1 + N / 2];
        nh.data = in_data[N - i - 1 - N / 2];
        // Even part (sums) into the lower output region...
        data[i * 2 + 0 + 2] = kl.re + nl.re;
        data[i * 2 + 1 + 2] = kl.im - nl.im;
        data[n2 - i * 2 - 1 - N] = kh.re + nh.re;
        data[n2 - i * 2 - 2 - N] = kh.im - nh.im;
        // ...odd part (differences) into the upper output region.
        data[i * 2 + 0 + 2 + N] = kl.im + nl.im;
        data[i * 2 + 1 + 2 + N] = kl.re - nl.re;
        data[n2 - i * 2 - 1] = kh.im + nh.im;
        data[n2 - i * 2 - 2] = kh.re - nh.re;
    }
    // Unpack DC/Nyquist: the Nyquist value packed in data[1] moves to data[N],
    // and both imaginary slots are zeroed.
    data[N] = data[1];
    data[1] = 0;
    data[N + 1] = 0;
    return result;
}
// Fixed-point counterpart of dl_rfft_post_proc_fc32_ansi: reconstructs the
// real-signal spectrum from an N-point int16 complex FFT of packed real data.
// Products are Q15*Q15; f1k is aligned by <<15 so everything is summed in
// Q30, rounded by `round`, and brought back with >>17 (the extra bit is the
// 0.5 factor of the float version).
esp_err_t dl_rfft_post_proc_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    dl_sc16_t *result = (dl_sc16_t *)data;
    // Original formula...
    // result[0].re = result[0].re + result[0].im;
    // result[N].re = result[0].re - result[0].im;
    // result[0].im = 0;
    // result[N].im = 0;
    // Optimized one: the +1 rounds the >>1 halving below.
    int32_t tmp_re = result[0].re + 1;
    result[0].re = (tmp_re + result[0].im) >> 1;
    result[0].im = (tmp_re - result[0].im) >> 1;
    int round = 1 << 16; // rounding offset for the >>17 below
    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    for (int k = 1; k <= N / 2; k++) {
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        // f1k: even part of the packed spectrum, f2k: odd part.
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        // Twiddle components read negated, same convention as the f32 version.
        int16_t c = -table[k * 2 - 1];
        int16_t s = -table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        f1k_re = f1k_re << 15; // align f1k with the Q30 products
        f1k_im = f1k_im << 15;
        result[k].re = (f1k_re + tw_re + round) >> 17;
        result[k].im = (f1k_im + tw_im + round) >> 17;
        result[N - k].re = (f1k_re - tw_re + round) >> 17;
        // Mirrored bin stored conjugated (tw_im - f1k_im), as in the f32 code.
        result[N - k].im = (tw_im - f1k_im + round) >> 17;
    }
    return ESP_OK;
}
// Fixed-point counterpart of dl_rfft_pre_proc_fc32_ansi: converts a real
// spectrum back into the packed form for the int16 inverse complex FFT.
// Same Q30 accumulate / >>17 scheme as dl_rfft_post_proc_sc16_ansi, with
// the `s` twiddle component read with opposite sign (inverse direction).
esp_err_t dl_rfft_pre_proc_sc16_ansi(int16_t *data, int N, int16_t *table)
{
    dl_sc16_t *result = (dl_sc16_t *)data;
    // NOTE(review): bin 0 is scaled by >>2 here (with +2 rounding) while the
    // post-proc uses >>1 — presumably the extra halving matches the inverse
    // path's overall scaling; confirm against the callers.
    int32_t tmp_re = result[0].re + 2;
    result[0].re = (tmp_re + result[0].im) >> 2;
    result[0].im = (tmp_re - result[0].im) >> 2;
    int round = 1 << 16; // rounding offset for the >>17 below
    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    for (int k = 1; k <= N / 2; k++) {
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        // Even/odd decomposition of the packed spectrum.
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        // Only `c` is negated (post-proc negates both): inverse direction.
        int16_t c = -table[k * 2 - 1];
        int16_t s = table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        f1k_re = f1k_re << 15; // align f1k with the Q30 products
        f1k_im = f1k_im << 15;
        result[k].re = (f1k_re + tw_re + round) >> 17;
        result[k].im = (f1k_im + tw_im + round) >> 17;
        result[N - k].re = (f1k_re - tw_re + round) >> 17;
        result[N - k].im = (tw_im - f1k_im + round) >> 17;
    }
    return ESP_OK;
}
// "High precision" fixed-point complex-to-real post-processing: like
// dl_rfft_post_proc_sc16_ansi but the output shift is chosen from the data's
// measured headroom (dl_array_max_q_s16) instead of being fixed at 17, and
// the applied exponent is accumulated into shift[0] for the caller.
esp_err_t dl_cplx2real_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift)
{
    dl_sc16_t *result = (dl_sc16_t *)data;
    // Original formula...
    // result[0].re = result[0].re + result[0].im;
    // result[N].re = result[0].re - result[0].im;
    // result[0].im = 0;
    // result[N].im = 0;
    // Optimized one:
    int loop_shift = dl_array_max_q_s16(data, N); // available headroom
    int round = 1 << loop_shift;                  // rounding for (loop_shift + 1) below
    int32_t tmp_re = result[0].re;
    shift[0] += loop_shift - 15; // record the net exponent applied
    if (loop_shift >= 15) {
        // Not enough headroom: scale bin 0 down.
        result[0].re = (tmp_re + result[0].im) >> (loop_shift - 15);
        result[0].im = (tmp_re - result[0].im) >> (loop_shift - 15);
    } else {
        // Plenty of headroom: scale bin 0 up instead.
        result[0].re = (tmp_re + result[0].im) << (15 - loop_shift);
        result[0].im = (tmp_re - result[0].im) << (15 - loop_shift);
    }
    int32_t f1k_re, f1k_im, f2k_re, f2k_im, tw_re, tw_im;
    loop_shift += 1; // extra bit: the 0.5 factor of the float version
    for (int k = 1; k <= N / 2; k++) {
        dl_sc16_t fpk = result[k];
        dl_sc16_t fpnk = result[N - k];
        // f1k: even part, f2k: odd part of the packed spectrum.
        f1k_re = fpk.re + fpnk.re;
        f1k_im = fpk.im - fpnk.im;
        f2k_re = fpk.re - fpnk.re;
        f2k_im = fpk.im + fpnk.im;
        // Twiddles read negated, same convention as the fixed-shift version.
        int16_t c = -table[k * 2 - 1];
        int16_t s = -table[k * 2 - 2];
        tw_re = c * f2k_re - s * f2k_im;
        tw_im = s * f2k_re + c * f2k_im;
        f1k_re = f1k_re << 15; // align with the Q30 products
        f1k_im = f1k_im << 15;
        result[k].re = (f1k_re + tw_re + round) >> loop_shift;
        result[k].im = (f1k_im + tw_im + round) >> loop_shift;
        result[N - k].re = (f1k_re - tw_re + round) >> loop_shift;
        result[N - k].im = (tw_im - f1k_im + round) >> loop_shift;
    }
    return ESP_OK;
}
int16_t *dl_gen_fft_table_sc16(int fft_point, uint32_t caps)
{
int16_t *fft_table = (int16_t *)heap_caps_aligned_alloc(16, fft_point * sizeof(int16_t), caps);
if (fft_table) {
float e = M_PI * 2.0 / fft_point;
for (int i = 0; i < (fft_point >> 1); i++) {
fft_table[2 * i] = (int16_t)roundf(INT16_MAX * cosf(i * e));
fft_table[2 * i + 1] = (int16_t)roundf(INT16_MAX * sinf(i * e));
}
dl_bitrev2r_sc16_ansi(fft_table, fft_point >> 1);
}
return fft_table;
}
int16_t *dl_gen_rfft_table_s16(int fft_point, uint32_t caps)
{
int16_t *fft_table = (int16_t *)heap_caps_aligned_alloc(16, fft_point * sizeof(int16_t), caps);
if (fft_table) {
float e = M_PI * 2.0 / fft_point;
for (int i = 0; i < (fft_point >> 1); i++) {
fft_table[2 * i] = (int16_t)roundf(INT16_MAX * cosf((i + 1) * e));
fft_table[2 * i + 1] = (int16_t)roundf(INT16_MAX * sinf((i + 1) * e));
}
}
return fft_table;
}

View File

@@ -0,0 +1,277 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dl_fft_base.h"
// Radix-4 in-place complex FFT (decimation-in-frequency), ANSI reference.
// `length` must be a power of 4 (log2 even) or ESP_FAIL is returned.
// `table` is the layout from dl_gen_fft4r_table_f32; the three winc pointers
// step through it at 1x/2x/3x rates for the three twiddled outputs.
// table_size is unused in this ANSI version. Pair with dl_bitrev4r_fc32_ansi
// for output reordering.
esp_err_t dl_fft4r_fc32_ansi(float *data, int length, float *table, int table_size)
{
    dl_fc32_t bfly[4];
    int log2N = dl_power_of_two(length);
    int log4N = log2N >> 1;
    if ((log2N & 0x01) != 0) {
        return ESP_FAIL; // length is not a power of 4
    }
    int m = 2;         // sub-FFT count bookkeeping (advances by 4x per stage)
    int wind_step = 2; // twiddle stride, grows 4x per stage
    while (1) { /// radix 4
        if (log4N == 0) {
            break;
        }
        length = length >> 2;
        for (int j = 0; j < m; j += 2) { // j: which FFT of this step
            int start_index = j * (length << 1); // n: n-point FFT
            // Four legs of the radix-4 butterfly, `length` apart.
            dl_fc32_t *ptrc0 = (dl_fc32_t *)data + start_index;
            dl_fc32_t *ptrc1 = ptrc0 + length;
            dl_fc32_t *ptrc2 = ptrc1 + length;
            dl_fc32_t *ptrc3 = ptrc2 + length;
            dl_fc32_t *winc0 = (dl_fc32_t *)table;
            dl_fc32_t *winc1 = winc0;
            dl_fc32_t *winc2 = winc0;
            for (int k = 0; k < length; k++) {
                dl_fc32_t in0 = *ptrc0;
                dl_fc32_t in2 = *ptrc2;
                dl_fc32_t in1 = *ptrc1;
                dl_fc32_t in3 = *ptrc3;
                // Radix-4 butterfly: bfly[1]/bfly[3] mix re/im terms, i.e.
                // the +/-i rotations of the DIF decomposition.
                bfly[0].re = in0.re + in2.re + in1.re + in3.re;
                bfly[0].im = in0.im + in2.im + in1.im + in3.im;
                bfly[1].re = in0.re - in2.re + in1.im - in3.im;
                bfly[1].im = in0.im - in2.im - in1.re + in3.re;
                bfly[2].re = in0.re + in2.re - in1.re - in3.re;
                bfly[2].im = in0.im + in2.im - in1.im - in3.im;
                bfly[3].re = in0.re - in2.re - in1.im + in3.im;
                bfly[3].im = in0.im - in2.im + in1.re - in3.re;
                // Leg 0 passes through untwiddled; legs 1..3 are multiplied
                // by the conjugated table entries (re*c + im*s form).
                *ptrc0 = bfly[0];
                ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
                ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
                ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
                ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
                ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
                ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
                winc0 += 1 * wind_step;
                winc1 += 2 * wind_step;
                winc2 += 3 * wind_step;
                ptrc0++;
                ptrc1++;
                ptrc2++;
                ptrc3++;
            }
        }
        m = m << 2;
        wind_step = wind_step << 2;
        log4N--;
    }
    return ESP_OK;
}
// Radix-4 in-place inverse complex FFT, ANSI reference. Mirrors
// dl_fft4r_fc32_ansi with the +/-i rotations of legs 1/3 swapped and the
// twiddle multiply un-conjugated (the lines marked "different" below).
// `length` must be a power of 4 or ESP_FAIL is returned; table_size is
// unused in this ANSI version.
esp_err_t dl_ifft4r_fc32_ansi(float *data, int length, float *table, int table_size)
{
    dl_fc32_t bfly[4];
    int log2N = dl_power_of_two(length);
    int log4N = log2N >> 1;
    if ((log2N & 0x01) != 0) {
        return ESP_FAIL; // length is not a power of 4
    }
    int m = 2;         // sub-FFT count bookkeeping (advances by 4x per stage)
    int wind_step = 2; // twiddle stride, grows 4x per stage
    while (1) { /// radix 4
        if (log4N == 0) {
            break;
        }
        length = length >> 2;
        for (int j = 0; j < m; j += 2) { // j: which FFT of this step
            int start_index = j * (length << 1); // n: n-point FFT
            dl_fc32_t *ptrc0 = (dl_fc32_t *)data + start_index;
            dl_fc32_t *ptrc1 = ptrc0 + length;
            dl_fc32_t *ptrc2 = ptrc1 + length;
            dl_fc32_t *ptrc3 = ptrc2 + length;
            dl_fc32_t *winc0 = (dl_fc32_t *)table;
            dl_fc32_t *winc1 = winc0;
            dl_fc32_t *winc2 = winc0;
            for (int k = 0; k < length; k++) {
                dl_fc32_t in0 = *ptrc0;
                dl_fc32_t in2 = *ptrc2;
                dl_fc32_t in1 = *ptrc1;
                dl_fc32_t in3 = *ptrc3;
                bfly[0].re = in0.re + in2.re + in1.re + in3.re;
                bfly[0].im = in0.im + in2.im + in1.im + in3.im;
                bfly[1].re = in0.re - in2.re - in1.im + in3.im; // this fft & ifft is different
                bfly[1].im = in0.im - in2.im + in1.re - in3.re; // this fft & ifft is different
                bfly[2].re = in0.re + in2.re - in1.re - in3.re;
                bfly[2].im = in0.im + in2.im - in1.im - in3.im;
                bfly[3].re = in0.re - in2.re + in1.im - in3.im; // this fft & ifft is different
                bfly[3].im = in0.im - in2.im - in1.re + in3.re; // this fft & ifft is different
                *ptrc0 = bfly[0];
                ptrc1->re = bfly[1].re * winc0->re - bfly[1].im * winc0->im; // this fft & ifft is different
                ptrc1->im = bfly[1].im * winc0->re + bfly[1].re * winc0->im; // this fft & ifft is different
                ptrc2->re = bfly[2].re * winc1->re - bfly[2].im * winc1->im; // this fft & ifft is different
                ptrc2->im = bfly[2].im * winc1->re + bfly[2].re * winc1->im; // this fft & ifft is different
                ptrc3->re = bfly[3].re * winc2->re - bfly[3].im * winc2->im; // this fft & ifft is different
                ptrc3->im = bfly[3].im * winc2->re + bfly[3].re * winc2->im; // this fft & ifft is different
                winc0 += 1 * wind_step;
                winc1 += 2 * wind_step;
                winc2 += 3 * wind_step;
                ptrc0++;
                ptrc1++;
                ptrc2++;
                ptrc3++;
            }
        }
        m = m << 2;
        wind_step = wind_step << 2;
        log4N--;
    }
    return ESP_OK;
}
// In-place base-4 digit-reversal reordering of N complex float values, the
// companion permutation for dl_fft4r_fc32_ansi/dl_ifft4r_fc32_ansi.
// With a table (from dl_gen_bitrev4r_table) the precomputed swap pairs are
// applied directly (offsets are complex index * 2); otherwise the reversed
// index is recomputed per element, which requires N to be a power of 4.
esp_err_t dl_bitrev4r_fc32_ansi(float *data, int N, uint16_t *bitrev_table, int bitrev_size)
{
    esp_err_t result = ESP_OK;
    if (bitrev_table) {
        float r_temp, i_temp;
        for (int n = 0; n < bitrev_size; n++) {
            // i/j are float-array offsets (already complex index * 2).
            uint16_t i = bitrev_table[n * 2];
            uint16_t j = bitrev_table[n * 2 + 1];
            r_temp = data[j];
            i_temp = data[j + 1];
            data[j] = data[i];
            data[i] = r_temp;
            data[j + 1] = data[i + 1];
            data[i + 1] = i_temp;
        }
    } else {
        int log2N = dl_power_of_two(N);
        int log4N = log2N >> 1;
        if ((log2N & 0x01) != 0) {
            return ESP_FAIL; // N is not a power of 4
        }
        float r_temp, i_temp;
        for (int i = 0; i < N; i++) {
            int cnt;
            int xx; // base-4 digit-reversed value of i
            int bits2;
            xx = 0;
            cnt = log4N;
            int j = i;
            // Reverse the log4N base-4 digits of i into xx.
            while (cnt > 0) {
                bits2 = j & 0x3;
                xx = (xx << 2) + bits2;
                j = j >> 2;
                cnt--;
            }
            // Swap each pair once (only when i precedes its reversed partner).
            if (i < xx) {
                r_temp = data[i * 2 + 0];
                i_temp = data[i * 2 + 1];
                data[i * 2 + 0] = data[xx * 2 + 0];
                data[i * 2 + 1] = data[xx * 2 + 1];
                data[xx * 2 + 0] = r_temp;
                data[xx * 2 + 1] = i_temp;
            }
        }
    }
    return result;
}
// Builds the swap table for base-4 digit-reversal of N complex values
// (consumed by dl_bitrev4r_fc32_ansi). Entries are float-array offsets
// (complex index * 2). N must be a power of 4.
//
// Fixes vs. the original:
//  - *bitrev_size is now zeroed on every failure path (the original zeroed it
//    on the odd-log2N path but left it stale when bailing out on the
//    UINT16_MAX check or when the allocation failed).
//  - The range check now bounds the largest stored offset, 2*(N-1), instead
//    of `count * 2` (count is the number of swaps, not an offset).
uint16_t *dl_gen_bitrev4r_table(int N, uint32_t caps, int *bitrev_size)
{
    bitrev_size[0] = 0; // default for all failure paths

    int log2N = dl_power_of_two(N);
    int log4N = log2N >> 1;
    if ((log2N & 0x01) != 0) {
        return NULL; // N is not a power of 4
    }
    // Entries are stored as float-array offsets; the largest one is 2*(N-1).
    if (2 * (N - 1) > UINT16_MAX) {
        return NULL;
    }

    // First pass: count the swaps so the table can be sized exactly.
    int count = 0;
    for (int i = 0; i < N; i++) {
        int xx = 0; // base-4 digit-reversed value of i
        int j = i;
        for (int cnt = log4N; cnt > 0; cnt--) {
            xx = (xx << 2) + (j & 0x3);
            j = j >> 2;
        }
        if (i < xx) {
            count++;
        }
    }

    uint16_t *bitrev_table = (uint16_t *)heap_caps_malloc(2 * count * sizeof(uint16_t), caps);
    if (bitrev_table == NULL) {
        return NULL;
    }

    // Second pass: record each (original, reversed) offset pair.
    int idx = 0;
    for (int i = 0; i < N; i++) {
        int xx = 0;
        int j = i;
        for (int cnt = log4N; cnt > 0; cnt--) {
            xx = (xx << 2) + (j & 0x3);
            j = j >> 2;
        }
        if (i < xx) {
            bitrev_table[idx * 2] = i * 2;
            bitrev_table[idx * 2 + 1] = xx * 2;
            idx++;
        }
    }
    bitrev_size[0] = count; // only published on success
    return bitrev_table;
}
float *dl_gen_fft4r_table_f32(int fft_point, uint32_t caps)
{
float *fft_table = (float *)heap_caps_aligned_alloc(16, fft_point * sizeof(float) * 2, caps);
if (fft_table) {
for (int i = 0; i < fft_point; i++) {
float angle = 2 * M_PI * i * 1.0 / fft_point;
fft_table[2 * i] = cosf(angle);
fft_table[2 * i + 1] = sinf(angle);
}
}
return fft_table;
}

View File

@@ -0,0 +1,92 @@
#include "dl_fft_base.h"
// True iff x is a non-zero power of two (x & (x-1) clears the lowest set bit).
bool dl_is_power_of_two(int x)
{
    if (x == 0) {
        return false;
    }
    return (x & (x - 1)) == 0;
}
// Returns floor(log2(n)) by binary search over the bit width; 0 for n <= 1.
int dl_power_of_two(uint32_t n)
{
    int pos = 0;
    for (int step = 16; step >= 1; step >>= 1) {
        if (n >= (1u << step)) {
            n >>= step;
            pos += step;
        }
    }
    return pos;
}
float *dl_short_to_float(const int16_t *x, int len, int exponent, float *y)
{
float scale = powf(2, exponent);
// printf("scale: %f\n", scale);
for (int i = 0; i < len; i++) {
y[i] = scale * x[i];
}
return y;
}
// Returns the Q-format headroom measure of an int16 array:
// 1 if all elements are zero, otherwise 2 + floor(log2(max |x[i]|)).
//
// Fixes vs. the original:
//  - The scan now starts at i = 0; the original started at i = 1 and
//    silently ignored x[0], under-reporting the maximum when x[0] dominated.
//  - The magnitude is tracked in an int so that |INT16_MIN| (32768) is
//    handled instead of wrapping back into int16_t.
int16_t dl_array_max_q_s16(const int16_t *x, int size)
{
    int max = 0; // max absolute value, int to hold |INT16_MIN|
    for (int i = 0; i < size; i++) {
        int v = x[i];
        if (v < 0) {
            v = -v;
        }
        if (v > max) {
            max = v;
        }
    }
    if (max == 0) {
        return 1; // all-zero input
    }
    int16_t k = 2;
    while (max > 1) {
        k++;
        max = max >> 1;
    }
    return k;
}
// Returns floor(log2(ceil(max |x[i]| + eps))) — the number of integer bits
// needed to represent the array's peak magnitude.
//
// Fix vs. the original: the scan now starts at i = 0; the original started
// at i = 1 and silently ignored x[0] (same off-by-one as the int16 variant).
int dl_array_max_q_f32(const float *x, int size, float eps)
{
    float max = 0;
    for (int i = 0; i < size; i++) {
        float v = fabsf(x[i]);
        if (v > max) {
            max = v;
        }
    }
    int max_int = ceilf(max + eps); // eps guards against exact powers of two
    return dl_power_of_two(max_int);
}
// Quantizes float samples to int16 with the largest scale that keeps the
// measured peak within `out_exponent` integer bits; returns the exponent the
// caller must apply (y[i] * 2^return is approximately x[i]).
int dl_float_to_short(const float *x, int len, int16_t *y, int out_exponent)
{
    const int exponent = out_exponent - dl_array_max_q_f32(x, len, 1e-8);
    const float scale = powf(2, exponent);
    for (int i = 0; i < len; i++) {
        y[i] = (int16_t)roundf(scale * x[i]);
    }
    return -exponent;
}

View File

@@ -0,0 +1,88 @@
#pragma once

// Internal API for the ESP-DL FFT kernels: ANSI reference implementations,
// table generators, and the per-target dispatch macros.

#include "dl_fft_dtype.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_heap_caps.h"
#include "esp_log.h"
#include <math.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

#include "dl_fft_platform.h"

// common function
bool dl_is_power_of_two(int x);  // true if x is a non-zero power of two
int dl_power_of_two(uint32_t n); // floor(log2(n))
float *dl_short_to_float(const int16_t *x, int len, int exponent, float *y); // y[i] = x[i] * 2^exponent
int16_t dl_array_max_q_s16(const int16_t *x, int size); // Q headroom of the peak magnitude
int dl_float_to_short(const float *x, int len, int16_t *y, int out_exponent); // quantize, returns exponent

// float fftr2 — radix-2 complex FFT, table generators and bit-reversal
float *dl_gen_fftr2_table_f32(int fft_point, uint32_t caps);
uint16_t *dl_gen_bitrev2r_table(int N, uint32_t caps, int *bitrev_size);
esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w);
esp_err_t dl_ifft2r_fc32_ansi(float *data, int N, float *w);
esp_err_t dl_bitrev2r_fc32_ansi(float *data, int N, uint16_t *reverse_tab, int bitrev_size);

// float fftr4 — radix-4 complex FFT plus real-FFT pre/post processing
float *dl_gen_rfft_table_f32(int fft_point, uint32_t caps);
float *dl_gen_fft4r_table_f32(int fft_point, uint32_t caps);
uint16_t *dl_gen_bitrev4r_table(int N, uint32_t caps, int *bitrev_size);
esp_err_t dl_fft4r_fc32_ansi(float *data, int length, float *table, int table_size);
esp_err_t dl_ifft4r_fc32_ansi(float *data, int length, float *table, int table_size);
esp_err_t dl_bitrev4r_fc32_ansi(float *data, int N, uint16_t *reverse_tab, int bitrev_size);
esp_err_t dl_rfft_post_proc_fc32_ansi(float *data, int N, float *table);
esp_err_t dl_rfft_pre_proc_fc32_ansi(float *data, int N, float *table);

// int16 fft and rfft — Q15 kernels; the _hp_ variants scale dynamically and
// report the applied exponent through *shift
int16_t *dl_gen_fft_table_sc16(int fft_point, uint32_t caps);
int16_t *dl_gen_rfft_table_s16(int fft_point, uint32_t caps);
esp_err_t dl_fft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);
esp_err_t dl_fft2r_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_ifft2r_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);
esp_err_t dl_ifft2r_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_bitrev2r_sc16_ansi(int16_t *data, int N);
esp_err_t dl_rfft_post_proc_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_rfft_pre_proc_sc16_ansi(int16_t *data, int N, int16_t *table);
esp_err_t dl_cplx2real_sc16_hp_ansi(int16_t *data, int N, int16_t *table, int *shift);

// Map the generic float-FFT names onto the per-target optimized assembly
// implementations (see dl_fft_platform.h); fall back to the ANSI versions.
#if CONFIG_IDF_TARGET_ESP32
#define dl_fft2r_fc32 dl_fft2r_fc32_ae32_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_ae32_
#define dl_fft4r_fc32 dl_fft4r_fc32_ae32_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_ae32_
#elif CONFIG_IDF_TARGET_ESP32S3
#define dl_fft2r_fc32 dl_fft2r_fc32_aes3_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_aes3_
#define dl_fft4r_fc32 dl_fft4r_fc32_aes3_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_aes3_
#elif CONFIG_IDF_TARGET_ESP32P4
#define dl_fft2r_fc32 dl_fft2r_fc32_arp4_
#define dl_ifft2r_fc32 dl_ifft2r_fc32_arp4_
#define dl_fft4r_fc32 dl_fft4r_fc32_arp4_
#define dl_ifft4r_fc32 dl_ifft4r_fc32_arp4_
#else
#define dl_fft2r_fc32 dl_fft2r_fc32_ansi
#define dl_ifft2r_fc32 dl_ifft2r_fc32_ansi
#define dl_fft4r_fc32 dl_fft4r_fc32_ansi
#define dl_ifft4r_fc32 dl_ifft4r_fc32_ansi
#endif

// int16 kernels only have ANSI implementations so far.
#define dl_fft2r_sc16 dl_fft2r_sc16_ansi
#define dl_fft2r_sc16_hp dl_fft2r_sc16_hp_ansi
#define dl_ifft2r_sc16 dl_ifft2r_sc16_ansi
#define dl_ifft2r_sc16_hp dl_ifft2r_sc16_hp_ansi

#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,30 @@
#pragma once

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// union to simplify access to the 16 bit data
// Packed int16 complex sample: .re/.im overlay .data so a whole (re, im)
// pair can be moved with a single 32-bit load/store.
typedef union dl_sc16_u {
    struct {
        int16_t re;
        int16_t im;
    };
    uint32_t data;
} dl_sc16_t;

// Single-precision complex sample; .data overlays both floats as one
// 64-bit word for the same single-access trick.
typedef union dl_fc32_u {
    struct {
        float re;
        float im;
    };
    uint64_t data;
} dl_fc32_t;

#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,36 @@
#pragma once

// Prototypes of the per-target assembly FFT kernels. The matching
// implementations live in target-specific .S files; dl_fft_base.h maps the
// generic dl_fft2r_fc32/... names onto these symbols.

#ifdef __cplusplus
extern "C" {
#endif

#if CONFIG_IDF_TARGET_ESP32
// ESP32 (Xtensa LX6) assembly implementations.
void dl_fft2r_fc32_ae32_(float *data, int N, float *table);
void dl_ifft2r_fc32_ae32_(float *data, int N, float *table);
void dl_fft4r_fc32_ae32_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_ae32_(float *data, int N, float *table, int table_size);
#elif CONFIG_IDF_TARGET_ESP32S3
// ESP32-S3 (Xtensa LX7) assembly implementations.
void dl_fft2r_fc32_aes3_(float *data, int N, float *table);
void dl_ifft2r_fc32_aes3_(float *data, int N, float *table);
void dl_fft4r_fc32_aes3_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_aes3_(float *data, int N, float *table, int table_size);
// void test_radix2_fft_bf_s16(int16_t *data, int16_t *table, int16_t fft_point, int16_t log2n, int16_t);
// int test_radix2_fft_bf_s16_hp(int16_t *, int16_t *, int16_t, int16_t, int16_t);
// void test_radix2_bit_reverse(int16_t *data, int16_t cpx_point, int16_t log2n);
// void test_fftr_s16(int16_t *, int16_t *, int16_t);
// void test_ffti_s16(int16_t *, int16_t *, int16_t);
// void test_radix2_ifft_bf_s16(int16_t *, int16_t *, int16_t, int16_t, int16_t);
// int test_radix2_ifft_bf_s16_hp(int16_t *, int16_t *, int16_t, int16_t, int16_t);
#elif CONFIG_IDF_TARGET_ESP32P4
// ESP32-P4 (RISC-V) assembly implementations.
void dl_fft2r_fc32_arp4_(float *data, int N, float *table);
void dl_ifft2r_fc32_arp4_(float *data, int N, float *table);
void dl_fft4r_fc32_arp4_(float *data, int N, float *table, int table_size);
void dl_ifft4r_fc32_arp4_(float *data, int N, float *table, int table_size);
#endif

#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,236 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
.text
.align 4
.global dl_fft2r_fc32_ae32_
.type dl_fft2r_fc32_ae32_,@function
// In-place radix-2 complex FFT butterfly passes for ESP32 (Xtensa FPU).
// Uses the zero-overhead LOOPNEZ hardware loop for the innermost butterfly.
// The function implements the following C code:
//esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w)
//{
//    int ie, ia, m;
//    float re_temp, im_temp;
//    float c, s;
//    ie = 1;
//    for (int N2 = N/2; N2 > 0; N2 >>= 1) {
//        ia = 0;
//        for (int j = 0; j < ie; j++) {
//            c = w[2 * j];
//            s = w[2 * j + 1];
//            for (int i = 0; i < N2; i++) {
//                m = ia + N2;
//                re_temp = c * data[2 * m] + s * data[2 * m + 1];
//                im_temp = c * data[2 * m + 1] - s * data[2 * m];
//                data[2 * m] = data[2 * ia] - re_temp;
//                data[2 * m + 1] = data[2 * ia + 1] - im_temp;
//                data[2 * ia] = data[2 * ia] + re_temp;
//                data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
//                ia++;
//            }
//            ia += N2;
//        }
//        ie <<= 1;
//    }
//    return result;
//}
dl_fft2r_fc32_ae32_:
//esp_err_t dl_fft2r_fc32_ae32_(float *data, int N, float* dl_fft_w_table_fc32)
    entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - N2, stage counter: for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - j<<3, byte offset of w[2*j]
// f0 - c or w[2 * j]
// f1 - s or w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2*ia])
// a14 - m pointer (&data[2*m])
// f6 - re_temp
// f7 - im_temp
    srli    a6, a3, 1      // a6 = N2 = N/2
    movi.n  a7, 1          // a7 = ie = 1
.fft2r_l1:
    movi.n  a8, 0          // a8 = j = 0
    movi.n  a11,0          // a11 = ia = 0
.fft2r_l2:                 // loop over j (twiddle index)
    addx8   a10, a8, a4    // a10 = &w[2*j] -- c = w[2*j]
    lsi     f0, a10, 0     // f0 = c
    lsi     f1, a10, 4     // f1 = s
    loopnez a6, .fft2r_l3  // hardware loop; body ends just before .fft2r_l3
    add.n   a12, a11, a6   // a12 = m = ia + N2
    addx8   a14, a12, a2   // a14 = &data[2*m]
    addx8   a13, a11, a2   // a13 = &data[2*ia]
    lsi     f4, a14, 0     // data[2*m]
    mul.s   f6, f0, f4     // re_temp = c * data[2*m]
    lsi     f5, a14, 4     // data[2*m + 1]
    mul.s   f7, f0, f5     // im_temp = c * data[2*m + 1]
    lsi     f2, a13, 0     // data[2*ia]
    madd.s  f6, f1, f5     // re_temp += s * data[2*m + 1]
    lsi     f3, a13, 4     // data[2*ia + 1]
    msub.s  f7, f1, f4     // im_temp -= s * data[2*m]
    addi    a11, a11, 1    // ia++
    sub.s   f8, f2, f6     // data[2*ia]     - re_temp
    add.s   f10, f2, f6    // data[2*ia]     + re_temp
    sub.s   f9, f3, f7     // data[2*ia + 1] - im_temp
    add.s   f11, f3, f7    // data[2*ia + 1] + im_temp
    ssi     f8, a14, 0
    ssi     f10, a13, 0
    ssi     f9, a14, 4
    ssi     f11, a13, 4
.fft2r_l3:                 // first instruction AFTER the hardware loop
    add.n   a11, a11, a6   // ia += N2
    addi.n  a8, a8, 1      // j++
    bne     a8, a7, .fft2r_l2
    slli    a7, a7, 1      // ie <<= 1
// main loop: next stage while N2 > 0
    srli    a6, a6, 1      // N2 >>= 1
    bnez    a6, .fft2r_l1
// movi.n a2, 0            // return status ESP_OK
    retw
.text
.align 4
.global dl_ifft2r_fc32_ae32_
.type dl_ifft2r_fc32_ae32_,@function
// In-place radix-2 complex INVERSE FFT for ESP32 (Xtensa FPU).
// Identical to dl_fft2r_fc32_ae32_ except the twiddle imaginary part is
// negated (conjugate twiddles). No 1/N scaling is applied here.
// The function implements the following C code:
//esp_err_t dl_ifft2r_fc32_ansi(float *data, int N, float *w)
//{
//    int ie, ia, m;
//    float re_temp, im_temp;
//    float c, s;
//    ie = 1;
//    for (int N2 = N/2; N2 > 0; N2 >>= 1) {
//        ia = 0;
//        for (int j = 0; j < ie; j++) {
//            c = w[2 * j];
//            s = -w[2 * j + 1];
//            for (int i = 0; i < N2; i++) {
//                m = ia + N2;
//                re_temp = c * data[2 * m] + s * data[2 * m + 1];
//                im_temp = c * data[2 * m + 1] - s * data[2 * m];
//                data[2 * m] = data[2 * ia] - re_temp;
//                data[2 * m + 1] = data[2 * ia + 1] - im_temp;
//                data[2 * ia] = data[2 * ia] + re_temp;
//                data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
//                ia++;
//            }
//            ia += N2;
//        }
//        ie <<= 1;
//    }
//    return result;
//}
dl_ifft2r_fc32_ae32_:
//esp_err_t dl_ifft2r_fc32_ae32_(float *data, int N, float* dl_fft_w_table_fc32)
    entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - N2, stage counter: for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - j<<3, byte offset of w[2*j]
// f0 - c or w[2 * j]
// f1 - s or -w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2*ia])
// a14 - m pointer (&data[2*m])
// f6 - re_temp
// f7 - im_temp
    srli    a6, a3, 1      // a6 = N2 = N/2
    movi.n  a7, 1          // a7 = ie = 1
.ifft2r_l1:
    movi.n  a8, 0          // a8 = j = 0
    movi.n  a11,0          // a11 = ia = 0
.ifft2r_l2:                // loop over j (twiddle index)
    addx8   a10, a8, a4    // a10 = &w[2*j] -- c = w[2*j]
    lsi     f0, a10, 0     // f0 = c
    lsi     f1, a10, 4     // f1 = w[2*j+1]
    // CHANGE: Negate the imaginary part of twiddle factors (w^-1 = conj(w))
    neg.s   f1, f1
    loopnez a6, .ifft2r_l3 // hardware loop; body ends just before .ifft2r_l3
    add.n   a12, a11, a6   // a12 = m = ia + N2
    addx8   a14, a12, a2   // a14 = &data[2*m]
    addx8   a13, a11, a2   // a13 = &data[2*ia]
    lsi     f4, a14, 0     // data[2*m]
    mul.s   f6, f0, f4     // re_temp = c * data[2*m]
    lsi     f5, a14, 4     // data[2*m + 1]
    mul.s   f7, f0, f5     // im_temp = c * data[2*m + 1]
    lsi     f2, a13, 0     // data[2*ia]
    madd.s  f6, f1, f5     // re_temp += s * data[2*m + 1]
    lsi     f3, a13, 4     // data[2*ia + 1]
    msub.s  f7, f1, f4     // im_temp -= s * data[2*m]
    addi    a11, a11, 1    // ia++
    sub.s   f8, f2, f6     // data[2*ia]     - re_temp
    add.s   f10, f2, f6    // data[2*ia]     + re_temp
    sub.s   f9, f3, f7     // data[2*ia + 1] - im_temp
    add.s   f11, f3, f7    // data[2*ia + 1] + im_temp
    ssi     f8, a14, 0
    ssi     f10, a13, 0
    ssi     f9, a14, 4
    ssi     f11, a13, 4
.ifft2r_l3:                // first instruction AFTER the hardware loop
    add.n   a11, a11, a6   // ia += N2
    addi.n  a8, a8, 1      // j++
    bne     a8, a7, .ifft2r_l2
    slli    a7, a7, 1      // ie <<= 1
// main loop: next stage while N2 > 0
    srli    a6, a6, 1      // N2 >>= 1
    bnez    a6, .ifft2r_l1
// movi.n a2, 0            // return status ESP_OK
    retw

View File

@@ -0,0 +1,332 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
.section .text # placed in IRAM instead of FLASH .text
.global dl_fft4r_fc32_ae32_
.type dl_fft4r_fc32_ae32_,@function
// In-place radix-4 complex FFT for ESP32 (Xtensa FPU).
// The function implements the following C code:
// esp_err_t dl_fft4r_fc32_ansi_(float *data, int length, float *table, int table_size)
// {
//     uint log2N = dl_power_of_two(length);
//     if ((log2N & 0x01) != 0) {
//         return ESP_ERR_DSP_INVALID_LENGTH;
//     }
//     uint log4N = log2N >> 1;
//
//     fc32_t bfly[4];
//     uint m = 2;
//     uint wind_step = table_size / length;
//     while (1) { ///radix 4
//         if (log4N == 0) {
//             break;
//         }
//         length = length >> 2;
//         for (int j = 0; j < m; j += 2) { // j: which FFT of this step
//             int start_index = j * (length << 1); // n: n-point FFT
//
//             fc32_t *ptrc0 = (fc32_t *)data + start_index;
//             fc32_t *ptrc1 = ptrc0 + length;
//             fc32_t *ptrc2 = ptrc1 + length;
//             fc32_t *ptrc3 = ptrc2 + length;
//
//             fc32_t *winc0 = (fc32_t *)table;
//             fc32_t *winc1 = winc0;
//             fc32_t *winc2 = winc0;
//
//             for (int k = 0; k < length; k++) {
//                 fc32_t in0 = *ptrc0;
//                 fc32_t in2 = *ptrc2;
//                 fc32_t in1 = *ptrc1;
//                 fc32_t in3 = *ptrc3;
//
//                 bfly[0].re = in0.re + in2.re + in1.re + in3.re;
//                 bfly[0].im = in0.im + in2.im + in1.im + in3.im;
//
//                 bfly[1].re = in0.re - in2.re + in1.im - in3.im;
//                 bfly[1].im = in0.im - in2.im - in1.re + in3.re;
//
//                 bfly[2].re = in0.re + in2.re - in1.re - in3.re;
//                 bfly[2].im = in0.im + in2.im - in1.im - in3.im;
//
//                 bfly[3].re = in0.re - in2.re - in1.im + in3.im;
//                 bfly[3].im = in0.im - in2.im + in1.re - in3.re;
//
//                 *ptrc0 = bfly[0];
//                 ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
//                 ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
//                 ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
//                 ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
//                 ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
//                 ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
//
//                 winc0 += 1 * wind_step;
//                 winc1 += 2 * wind_step;
//                 winc2 += 3 * wind_step;
//
//                 ptrc0++;
//                 ptrc1++;
//                 ptrc2++;
//                 ptrc3++;
//             }
//         }
//         m = m << 2;
//         wind_step = wind_step << 2;
//         log4N--;
//     }
//     return ESP_OK;
// }
// esp_err_t dl_fft4r_fc32_ae32_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
//.ret_DSP_INVALID_LENGTH:
//    movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
//    retw.n
.align 4
dl_fft4r_fc32_ae32_:
    entry a1, 16               # no auto vars on stack
// a2 - data, a3 - N, a4 - table, a5 - table_size (a5 is reused for m below)
// NOTE(review): length validation is compiled out; caller must pass a power of 4.
//    bltui a3, 4, .ret_DSP_INVALID_LENGTH # if N < 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
//    addi.n a6, a3, -1
//    and a6, a3, a6
//    bnez a6, .ret_DSP_INVALID_LENGTH # if N not power of 2 : return(ESP_ERR_DSP_INVALID_LENGTH)
    nsau    a6, a3             # inline dl_power_of_two(N): a6 = 31 - clz(N) = log2(N)
    movi.n  a7, 31
    xor     a6, a6, a7
//    bbsi a6, 0, .ret_DSP_INVALID_LENGTH # if N not power of 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
    srli    a7, a6, 1          # log4N = dl_power_of_two(N) >> 1
    addi.n  a6, a6, -1
    ssr     a6
    srl     a6, a5             # w_step = table_size >> (dl_power_of_two(N) - 1)
    movi.n  a5, 2              # m = 2
.stage:                        # one radix-4 stage
    srli    a3, a3, 2          # N >>= 2 (sub-FFT length)
    movi.n  a8, 0              # j = 0
.group:                        # one group of butterflies within the stage
    mov.n   a9, a4             # w0 = w
    mov.n   a10, a4            # w1 = w
    mov.n   a11, a4            # w2 = w
    mul16u  a12, a8, a3
    slli    a12, a12, 1        # start_index = (j * N) << 1
    addx8   a12, a12, a2       # p0 = data + start_index (complex)
    addx8   a13, a3, a12       # p1 = p0 + N (complex)
    addx8   a14, a3, a13       # p2 = p1 + N (complex)
    addx8   a15, a3, a14       # p3 = p2 + N (complex)
    loopnez a3, .bf4_loop_end  # for (uint k = 0; k < N; k++)
    lsi     f1, a12, 4         # f1 = in0.im = *(p0 + 1)
    lsi     f3, a14, 4         # f3 = in2.im = *(p2 + 1)
    lsi     f0, a12, 0         # f0 = in0.re = *p0
    lsi     f2, a14, 0         # f2 = in2.re = *p2
    add.s   f5, f1, f3         # f5 = in0.im + in2.im
    sub.s   f7, f1, f3         # f7 = in0.im - in2.im
    lsi     f1, a13, 4         # f1 = in1.im = *(p1 + 1)
    lsi     f3, a15, 4         # f3 = in3.im = *(p3 + 1)
    add.s   f4, f0, f2         # f4 = in0.re + in2.re
    sub.s   f6, f0, f2         # f6 = in0.re - in2.re
    add.s   f9, f1, f3         # f9 = in1.im + in3.im
    sub.s   f11, f1, f3        # f11 = in1.im - in3.im
    lsi     f0, a13, 0         # f0 = in1.re = *p1
    lsi     f2, a15, 0         # f2 = in3.re = *p3
    lsi     f12, a9, 0         # f12 = w0->re
    lsi     f13, a10, 0        # f13 = w1->re
    lsi     f14, a11, 0        # f14 = w2->re
    add.s   f8, f0, f2         # f8 = in1.re + in3.re
    sub.s   f10, f0, f2        # f10 = in1.re - in3.re
    sub.s   f1, f5, f9         # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
    add.s   f5, f5, f9         # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
    add.s   f2, f6, f11        # f2 = bf1.re = in0.re - in2.re + in1.im - in3.im
    sub.s   f6, f6, f11        # f6 = bf3.re = in0.re - in2.re - in1.im + in3.im
    sub.s   f0, f4, f8         # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
    add.s   f4, f4, f8         # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
    sub.s   f3, f7, f10        # f3 = bf1.im = in0.im - in2.im - in1.re + in3.re
    add.s   f7, f7, f10        # f7 = bf3.im = in0.im - in2.im + in1.re - in3.re
    ssi     f5, a12, 4         # *(p0 + 1) = f5 = bf0.im
    ssip    f4, a12, 8         # *p0 = f4 = bf0.re , p0 += 2
    mul.s   f5, f3, f12        # f5 = bf1.im * w0->re
    mul.s   f4, f2, f12        # f4 = bf1.re * w0->re
    mul.s   f9, f1, f13        # f9 = bf2.im * w1->re
    mul.s   f8, f0, f13        # f8 = bf2.re * w1->re
    mul.s   f11, f7, f14       # f11 = bf3.im * w2->re
    mul.s   f10, f6, f14       # f10 = bf3.re * w2->re
    lsi     f12, a9, 4         # f12 = w0->im
    lsi     f13, a10, 4        # f13 = w1->im
    lsi     f14, a11, 4        # f14 = w2->im
    addx4   a9, a6, a9         # w0 += 1 * wind_step
    addx8   a10, a6, a10       # w1 += 2 * wind_step
    addx4   a11, a6, a11
    addx8   a11, a6, a11       # w2 += 3 * wind_step
    msub.s  f5, f2, f12        # f5 = bf1.im * w0->re - bf1.re * w0->im
    madd.s  f4, f3, f12        # f4 = bf1.re * w0->re + bf1.im * w0->im
    msub.s  f9, f0, f13        # f9 = bf2.im * w1->re - bf2.re * w1->im
    madd.s  f8, f1, f13        # f8 = bf2.re * w1->re + bf2.im * w1->im
    msub.s  f11, f6, f14       # f11 = bf3.im * w2->re - bf3.re * w2->im
    madd.s  f10, f7, f14       # f10 = bf3.re * w2->re + bf3.im * w2->im
    ssi     f5, a13, 4         # *(p1 + 1) = f5
    ssip    f4, a13, 8         # *p1 = f4, p1 += 2
    ssi     f9, a14, 4         # *(p2 + 1) = f9
    ssip    f8, a14, 8         # *p2 = f8, p2 += 2
    ssi     f11, a15, 4        # *(p3 + 1) = f11
    ssip    f10, a15, 8        # *p3 = f10, p3 += 2
.bf4_loop_end:
    addi.n  a8, a8, 2          # j += 2
    bgeu    a8, a5, .stage_next # if j >= m
    j       .group
.stage_next:
    slli    a5, a5, 2          # m <<= 2
    slli    a6, a6, 2          # w_step <<= 2
    addi.n  a7, a7, -1         # log4N--
    bnez    a7, .stage         # if log4N > 0
//    movi.n a2, DSP_OK        # return(DSP_OK)
    retw
.section .text # placed in IRAM instead of FLASH .text
.global dl_ifft4r_fc32_ae32_
.type dl_ifft4r_fc32_ae32_,@function
// In-place radix-4 complex INVERSE FFT for ESP32 (Xtensa FPU).
// Mirrors dl_fft4r_fc32_ae32_ but conjugates the butterfly rotations and the
// twiddle multiplication (add.s/sub.s and madd.s/msub.s pairs are swapped
// relative to the forward transform). No 1/N scaling is applied here.
// esp_err_t dl_ifft4r_fc32_ae32_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
//.ret_DSP_INVALID_LENGTH:
//    movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
//    retw.n
.align 4
dl_ifft4r_fc32_ae32_:
    entry a1, 16               # no auto vars on stack
// a2 - data, a3 - N, a4 - table, a5 - table_size (a5 is reused for m below)
//    bltui a3, 4, .ret_DSP_INVALID_LENGTH # if N < 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
//    addi.n a6, a3, -1
//    and a6, a3, a6
//    bnez a6, .ret_DSP_INVALID_LENGTH # if N not power of 2 : return(ESP_ERR_DSP_INVALID_LENGTH)
    nsau    a6, a3             # inline dl_power_of_two(N): a6 = log2(N)
    movi.n  a7, 31
    xor     a6, a6, a7
//    bbsi a6, 0, .ret_DSP_INVALID_LENGTH # if N not power of 4 : return(ESP_ERR_DSP_INVALID_LENGTH)
    srli    a7, a6, 1          # log4N = dl_power_of_two(N) >> 1
    addi.n  a6, a6, -1
    ssr     a6
    srl     a6, a5             # w_step = table_size >> (dl_power_of_two(N) - 1)
    movi.n  a5, 2              # m = 2
.ifft_stage:                   # one radix-4 stage
    srli    a3, a3, 2          # N >>= 2 (sub-FFT length)
    movi.n  a8, 0              # j = 0
.ifft_group:                   # one group of butterflies within the stage
    mov.n   a9, a4             # w0 = w
    mov.n   a10, a4            # w1 = w
    mov.n   a11, a4            # w2 = w
    mul16u  a12, a8, a3
    slli    a12, a12, 1        # start_index = (j * N) << 1
    addx8   a12, a12, a2       # p0 = data + start_index (complex)
    addx8   a13, a3, a12       # p1 = p0 + N (complex)
    addx8   a14, a3, a13       # p2 = p1 + N (complex)
    addx8   a15, a3, a14       # p3 = p2 + N (complex)
    loopnez a3, .inv_bf4_loop_end # for (uint k = 0; k < N; k++)
    lsi     f1, a12, 4         # f1 = in0.im = *(p0 + 1)
    lsi     f3, a14, 4         # f3 = in2.im = *(p2 + 1)
    lsi     f0, a12, 0         # f0 = in0.re = *p0
    lsi     f2, a14, 0         # f2 = in2.re = *p2
    add.s   f5, f1, f3         # f5 = in0.im + in2.im
    sub.s   f7, f1, f3         # f7 = in0.im - in2.im
    lsi     f1, a13, 4         # f1 = in1.im = *(p1 + 1)
    lsi     f3, a15, 4         # f3 = in3.im = *(p3 + 1)
    add.s   f4, f0, f2         # f4 = in0.re + in2.re
    sub.s   f6, f0, f2         # f6 = in0.re - in2.re
    add.s   f9, f1, f3         # f9 = in1.im + in3.im
    sub.s   f11, f1, f3        # f11 = in1.im - in3.im
    lsi     f0, a13, 0         # f0 = in1.re = *p1
    lsi     f2, a15, 0         # f2 = in3.re = *p3
    lsi     f12, a9, 0         # f12 = w0->re
    lsi     f13, a10, 0        # f13 = w1->re
    lsi     f14, a11, 0        # f14 = w2->re
    add.s   f8, f0, f2         # f8 = in1.re + in3.re
    sub.s   f10, f0, f2        # f10 = in1.re - in3.re
    sub.s   f1, f5, f9         # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
    add.s   f5, f5, f9         # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
    sub.s   f2, f6, f11        # f2 = bf1.re = in0.re - in2.re - in1.im + in3.im  (inverse)
    add.s   f6, f6, f11        # f6 = bf3.re = in0.re - in2.re + in1.im - in3.im  (inverse)
    sub.s   f0, f4, f8         # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
    add.s   f4, f4, f8         # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
    add.s   f3, f7, f10        # f3 = bf1.im = in0.im - in2.im + in1.re - in3.re  (inverse)
    sub.s   f7, f7, f10        # f7 = bf3.im = in0.im - in2.im - in1.re + in3.re  (inverse)
    ssi     f5, a12, 4         # *(p0 + 1) = f5 = bf0.im
    ssip    f4, a12, 8         # *p0 = f4 = bf0.re , p0 += 2
    mul.s   f5, f3, f12        # f5 = bf1.im * w0->re
    mul.s   f4, f2, f12        # f4 = bf1.re * w0->re
    mul.s   f9, f1, f13        # f9 = bf2.im * w1->re
    mul.s   f8, f0, f13        # f8 = bf2.re * w1->re
    mul.s   f11, f7, f14       # f11 = bf3.im * w2->re
    mul.s   f10, f6, f14       # f10 = bf3.re * w2->re
    lsi     f12, a9, 4         # f12 = w0->im
    lsi     f13, a10, 4        # f13 = w1->im
    lsi     f14, a11, 4        # f14 = w2->im
    addx4   a9, a6, a9         # w0 += 1 * wind_step
    addx8   a10, a6, a10       # w1 += 2 * wind_step
    addx4   a11, a6, a11
    addx8   a11, a6, a11       # w2 += 3 * wind_step
    madd.s  f5, f2, f12        # f5 = bf1.im * w0->re + bf1.re * w0->im  (conjugate twiddle)
    msub.s  f4, f3, f12        # f4 = bf1.re * w0->re - bf1.im * w0->im
    madd.s  f9, f0, f13        # f9 = bf2.im * w1->re + bf2.re * w1->im
    msub.s  f8, f1, f13        # f8 = bf2.re * w1->re - bf2.im * w1->im
    madd.s  f11, f6, f14       # f11 = bf3.im * w2->re + bf3.re * w2->im
    msub.s  f10, f7, f14       # f10 = bf3.re * w2->re - bf3.im * w2->im
    ssi     f5, a13, 4         # *(p1 + 1) = f5
    ssip    f4, a13, 8         # *p1 = f4, p1 += 2
    ssi     f9, a14, 4         # *(p2 + 1) = f9
    ssip    f8, a14, 8         # *p2 = f8, p2 += 2
    ssi     f11, a15, 4        # *(p3 + 1) = f11
    ssip    f10, a15, 8        # *p3 = f10, p3 += 2
.inv_bf4_loop_end:
    addi.n  a8, a8, 2          # j += 2
    bgeu    a8, a5, .ifft_stage_next # if j >= m
    j       .ifft_group
.ifft_stage_next:
    slli    a5, a5, 2          # m <<= 2
    slli    a6, a6, 2          # w_step <<= 2
    addi.n  a7, a7, -1         # log4N--
    bnez    a7, .ifft_stage    # if log4N > 0
//    movi.n a2, DSP_OK        # return(DSP_OK)
    retw

View File

@@ -0,0 +1,153 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// In-place radix-2 complex FFT for the ESP32-P4 (RISC-V, single-precision FPU,
// ESP hardware-loop extension). Same algorithm as dl_fft2r_fc32_ansi.
// RISC-V calling convention: a0 = data, a1 = N, a2 = twiddle table w.
// Register use: t6 = N2, t0 = ie, t1 = j, t4 = ia, t5 = m,
//               a4 = &data[2*m], a3 = &data[2*ia], fa0 = c, fa1 = s.
.text
.align 4
.global dl_fft2r_fc32_arp4_
.type dl_fft2r_fc32_arp4_,@function
dl_fft2r_fc32_arp4_:
//esp_err_t dl_fft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
    add sp,sp,-16                // NOTE(review): frame is reserved but never used
#
    srli t6, a1, 1               // t6 = N2 = N/2
    li t0, 1                     // t0 = ie = 1
.fft2r_l1:
    li t1, 0                     // t1 = j = 0
    li t4, 0                     // t4 = ia = 0
.fft2r_l2:                       // loop over j (twiddle index)
    slli t3, t1, 3               // t3 = j*8 -- byte offset of w[2*j]
    add t3, t3, a2               // t3 = &w[2*j]
    flw fa0, 0(t3)               // fa0 = c = w[2*j]
    flw fa1, 4(t3)               // fa1 = s = w[2*j+1]
    esp.lp.setup 0, t6, .fft2r_l3 // hardware loop; .fft2r_l3 marks the LAST in-loop instruction
    add t5, t4, t6               // t5 = m = ia + N2
    slli a4, t5, 3               // byte offset of data[2*m]
    slli a3, t4, 3               // byte offset of data[2*ia]
    add a4, a4, a0               // a4 = &data[2*m]
    add a3, a3, a0               // a3 = &data[2*ia]
    flw fa4, 0(a4)               // data[2*m]
    flw fa5, 4(a4)               // data[2*m+1]
    flw fa2, 0(a3)               // data[2*ia]
    flw fa3, 4(a3)               // data[2*ia+1]
    fmul.s ft6, fa0, fa4         // re_temp = c * data[2*m]
    fmul.s ft7, fa0, fa5         // im_temp = c * data[2*m+1]
    fmadd.s ft6, fa1, fa5, ft6   // re_temp += s * data[2*m+1]
    fnmsub.s ft7, fa1, fa4, ft7  // im_temp -= s * data[2*m]
    fsub.s ft8, fa2, ft6         // data[2*ia]   - re_temp
    fsub.s ft9, fa3, ft7         // data[2*ia+1] - im_temp
    fadd.s ft10, fa2, ft6        // data[2*ia]   + re_temp
    fadd.s ft11, fa3, ft7        // data[2*ia+1] + im_temp
    fsw ft8, 0(a4)
    fsw ft9, 4(a4)
    fsw ft10, 0(a3)
    fsw ft11, 4(a3)
.fft2r_l3: add t4, t4, 1         // ia++ (last instruction of the hardware loop)
    add t4, t4, t6               // ia += N2
    add t1, t1, 1                // j++
    BNE t1, t0, .fft2r_l2
    slli t0, t0, 1               // ie <<= 1
    srli t6, t6, 1               // N2 >>= 1
    BNEZ t6, .fft2r_l1           // next stage while N2 > 0
#
    add sp,sp,16
    li a0,0                      // return ESP_OK
    ret
// In-place radix-2 complex INVERSE FFT for the ESP32-P4 (RISC-V).
// Identical to dl_fft2r_fc32_arp4_ except the twiddle imaginary part is
// negated (conjugate twiddles). No 1/N scaling is applied here.
// RISC-V calling convention: a0 = data, a1 = N, a2 = twiddle table w.
.text
.align 4
.global dl_ifft2r_fc32_arp4_
.type dl_ifft2r_fc32_arp4_,@function
dl_ifft2r_fc32_arp4_:
//esp_err_t dl_ifft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
    add sp,sp,-16                // NOTE(review): frame is reserved but never used
#
    srli t6, a1, 1               // t6 = N2 = N/2
    li t0, 1                     // t0 = ie = 1
.ifft2r_l1:
    li t1, 0                     // t1 = j = 0
    li t4, 0                     // t4 = ia = 0
.ifft2r_l2:                      // loop over j (twiddle index)
    slli t3, t1, 3               // t3 = j*8 -- byte offset of w[2*j]
    add t3, t3, a2               // t3 = &w[2*j]
    flw fa0, 0(t3)               // fa0 = c = w[2*j]
    flw fa1, 4(t3)               // fa1 = w[2*j+1]
    // CHANGE: Negate the imaginary part of twiddle factors (complex conjugate)
    fneg.s fa1, fa1              // s = -s (since w^-1 = w*)
    esp.lp.setup 0, t6, .ifft2r_l3 // hardware loop; .ifft2r_l3 marks the LAST in-loop instruction
    add t5, t4, t6               // t5 = m = ia + N2
    slli a4, t5, 3               // byte offset of data[2*m]
    slli a3, t4, 3               // byte offset of data[2*ia]
    add a4, a4, a0               // a4 = &data[2*m]
    add a3, a3, a0               // a3 = &data[2*ia]
    flw fa4, 0(a4)               // data[2*m]
    flw fa5, 4(a4)               // data[2*m+1]
    flw fa2, 0(a3)               // data[2*ia]
    flw fa3, 4(a3)               // data[2*ia+1]
    fmul.s ft6, fa0, fa4         // re_temp = c * data[2*m]
    fmul.s ft7, fa0, fa5         // im_temp = c * data[2*m+1]
    fmadd.s ft6, fa1, fa5, ft6   // re_temp += s * data[2*m+1]
    fnmsub.s ft7, fa1, fa4, ft7  // im_temp -= s * data[2*m]
    fsub.s ft8, fa2, ft6         // data[2*ia]   - re_temp
    fsub.s ft9, fa3, ft7         // data[2*ia+1] - im_temp
    fadd.s ft10, fa2, ft6        // data[2*ia]   + re_temp
    fadd.s ft11, fa3, ft7        // data[2*ia+1] + im_temp
    fsw ft8, 0(a4)
    fsw ft9, 4(a4)
    fsw ft10, 0(a3)
    fsw ft11, 4(a3)
.ifft2r_l3: add t4, t4, 1        // ia++ (last instruction of the hardware loop)
    add t4, t4, t6               // ia += N2
    add t1, t1, 1                // j++
    BNE t1, t0, .ifft2r_l2
    slli t0, t0, 1               // ie <<= 1
    srli t6, t6, 1               // N2 >>= 1
    BNEZ t6, .ifft2r_l1          // next stage while N2 > 0
#
    add sp,sp,16
    li a0,0                      // return ESP_OK
    ret

View File

@@ -0,0 +1,304 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// In-place radix-4 complex FFT for the ESP32-P4 (RISC-V, single FPU,
// ESP hardware-loop extension). Same algorithm as the ANSI radix-4 reference.
// a0 = data, a1 = length (power of 4), a2 = table, a3 = table_size.
.text
.align 4
.global dl_fft4r_fc32_arp4_
.type dl_fft4r_fc32_arp4_,@function
dl_fft4r_fc32_arp4_:
//esp_err_t dl_fft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// table_size - a3 (becomes wind_step in bytes)
// m - t0
// j - t1
// t6 - stage counter: N/2, shifted right 2 per stage (loops log4(N) times)
    add sp,sp,-16                // NOTE(review): frame is reserved but never used
#
    srli t6, a1, 1               // t6 = N/2, >>2 per stage => log4(N) iterations
    li t0, 2                     // t0 = m = 2
    div a3, a3, a1               // wind_step = table_size / N
    slli a3, a3, 3               // wind_step in bytes (one complex = 8 bytes)
.fft2r_l1:                       // one radix-4 stage
    li t1, 0                     // t1 = j = 0
    srli a1, a1, 2               // a1 = length = length >> 2
.fft2r_l2:                       // loop for j, t1 - j
    slli t2, a1, 4               // t2 = (length << 1) * 8 bytes -- group stride
    slli t3, a1, 3               // t3 = length * 8 bytes -- sub-block stride
    // start_index = j * (length << 1); // n: n-point FFT
    mul t2,t2,t1
    add a4, a0, t2               // fc32_t *ptrc0
    add a5, a4, t3               // fc32_t *ptrc1
    add a6, a5, t3               // fc32_t *ptrc2
    add a7, a6, t3               // fc32_t *ptrc3
#    flw fa0, 0(a4)
#    fsw fa0, 0(t3)
#    add t3, t3, 4
    mv t2, a2                    // winc0
    mv t3, a2                    // winc1
    mv t4, a2                    // winc2
    esp.lp.setup 0, a1, .fft2r_l3 // hardware loop; .fft2r_l3 marks the LAST in-loop instruction
    flw fa0, 0(a4)               // in0.re
    flw fa4, 0(a6)               // in2.re
    fadd.s ft0, fa0, fa4         // ft0 = in0.re + in2.re
    flw fa1, 4(a4)               // in0.im
    fsub.s ft1, fa0, fa4         // ft1 = in0.re - in2.re
    flw fa5, 4(a6)               // in2.im
    fadd.s ft2, fa1, fa5         // ft2 = in0.im + in2.im
    flw fa2, 0(a5)               // in1.re
    fsub.s ft3, fa1, fa5         // ft3 = in0.im - in2.im
    flw fa6, 0(a7)               // in3.re
    fadd.s ft4, fa2, fa6         // ft4 = in1.re + in3.re
    flw fa3, 4(a5)               // in1.im
    fsub.s ft5, fa2, fa6         // ft5 = in1.re - in3.re
    flw fa7, 4(a7)               // in3.im
    fadd.s ft6, fa3, fa7         // ft6 = in1.im + in3.im
    fsub.s ft7, fa3, fa7         // ft7 = in1.im - in3.im
# bfly[0].re = ft0 + ft4;
    fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
    fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 + ft7;
    fadd.s fa2, ft1, ft7;
# bfly[1].im = ft3 - ft5;
    fsub.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft4;
    fsub.s fa4, ft0, ft4;
    flw ft0, 0(t2)               // winc0->re
# bfly[2].im = ft2 - ft6;
    fsub.s fa5, ft2, ft6;
    flw ft2, 0(t3)               // winc1->re
# bfly[3].re = ft1 - ft7;
    fsub.s fa6, ft1, ft7;
    flw ft1, 4(t2)               // winc0->im
# bfly[3].im = ft3 + ft5;
    fadd.s fa7, ft3, ft5;
    // *ptrc0 = bfly[0];
    fsw fa0, 0(a4)               // ptrc0->re
    fsw fa1, 4(a4)               // ptrc0->im
    flw ft3, 4(t3)               // winc1->im
    // ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
    // ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
    // ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
    fmul.s fa0, fa2, ft0         // bfly[1].re * winc0->re
    add t2, t2, a3               // winc0 += 1 * wind_step;
    fmul.s fa1, fa3, ft0         // bfly[1].im * winc0->re
    fmul.s ft0, fa4, ft2         // bfly[2].re * winc1->re
    fmul.s ft2, fa5, ft2         // bfly[2].im * winc1->re
    flw ft4, 0(t4)               // winc2->re
    flw ft5, 4(t4)               // winc2->im
    fmadd.s fa0, fa3, ft1, fa0   // ptrc1->re
    add t3, t3, a3               // winc1 += 2 * wind_step;
    fnmsub.s fa1, fa2, ft1, fa1  // ptrc1->im
    add t3, t3, a3               //
    fmul.s fa2, fa6, ft4         // bfly[3].re * winc2->re
    fmul.s fa3, fa7, ft4         // bfly[3].im * winc2->re
    add t4, t4, a3               // winc2 += 3 * wind_step;
    fmadd.s ft0, fa5, ft3, ft0   // ptrc2->re
    add t4, t4, a3               //
    fnmsub.s ft2, fa4, ft3, ft2  // ptrc2->im
    fmadd.s ft3, fa7, ft5, fa2   // ptrc3->re
    add t4, t4, a3               //
    fnmsub.s fa3, fa6, ft5, fa3  // ptrc3->im
    fsw fa0, 0(a5)               // ptrc1->re
    add a4, a4, 8                // ptrc0++
    fsw fa1, 4(a5)               // ptrc1->im
    add a5, a5, 8                // ptrc1++
    fsw ft0, 0(a6)               // ptrc2->re
    // ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
    fsw ft2, 4(a6)               // ptrc2->im
    // ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
    add a6, a6, 8                // ptrc2++
    fsw ft3, 0(a7)               // ptrc3->re
    // ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
    fsw fa3, 4(a7)               // ptrc3->im
    add a7, a7, 8                // ptrc3++
// Temp solution
.fft2r_l3: nop                   // loop-end marker (last in-loop instruction)
    add t1, t1, 2                // j += 2
    BNE t1, t0, .fft2r_l2
    slli t0, t0, 2               // m <<= 2
    srli t6, t6, 2               // stage counter >>= 2
    slli a3, a3, 2               // wind_step <<= 2
    BNEZ t6, .fft2r_l1           // next stage while counter > 0
#
    add sp,sp,16
    li a0,0                      // return ESP_OK
    ret
// In-place radix-4 complex INVERSE FFT for the ESP32-P4 (RISC-V).
// Mirrors dl_fft4r_fc32_arp4_ with conjugated butterfly rotations and twiddle
// multiplications (fadd/fsub and fmadd/fnmsub pairs swapped). No 1/N scaling.
// a0 = data, a1 = length (power of 4), a2 = table, a3 = table_size.
.text
.align 4
.global dl_ifft4r_fc32_arp4_
.type dl_ifft4r_fc32_arp4_,@function
dl_ifft4r_fc32_arp4_:
//esp_err_t dl_ifft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// table_size - a3 (becomes wind_step in bytes)
// m - t0
// j - t1
// t6 - stage counter: N/2, shifted right 2 per stage (loops log4(N) times)
    add sp,sp,-16                // NOTE(review): frame is reserved but never used
#
    srli t6, a1, 1               // t6 = N/2, >>2 per stage => log4(N) iterations
    li t0, 2                     // t0 = m = 2
    div a3, a3, a1               // wind_step = table_size / N
    slli a3, a3, 3               // wind_step in bytes (one complex = 8 bytes)
.ifft2r_l1:                      // one radix-4 stage
    li t1, 0                     // t1 = j = 0
    srli a1, a1, 2               // a1 = length = length >> 2
.ifft2r_l2:                      // loop for j, t1 - j
    slli t2, a1, 4               // t2 = (length << 1) * 8 bytes -- group stride
    slli t3, a1, 3               // t3 = length * 8 bytes -- sub-block stride
    // start_index = j * (length << 1); // n: n-point FFT
    mul t2,t2,t1
    add a4, a0, t2               // fc32_t *ptrc0
    add a5, a4, t3               // fc32_t *ptrc1
    add a6, a5, t3               // fc32_t *ptrc2
    add a7, a6, t3               // fc32_t *ptrc3
#    flw fa0, 0(a4)
#    fsw fa0, 0(t3)
#    add t3, t3, 4
    mv t2, a2                    // winc0
    mv t3, a2                    // winc1
    mv t4, a2                    // winc2
    esp.lp.setup 0, a1, .ifft2r_l3 // hardware loop; .ifft2r_l3 marks the LAST in-loop instruction
    flw fa0, 0(a4)               // in0.re
    flw fa4, 0(a6)               // in2.re
    fadd.s ft0, fa0, fa4         // ft0 = in0.re + in2.re
    flw fa1, 4(a4)               // in0.im
    fsub.s ft1, fa0, fa4         // ft1 = in0.re - in2.re
    flw fa5, 4(a6)               // in2.im
    fadd.s ft2, fa1, fa5         // ft2 = in0.im + in2.im
    flw fa2, 0(a5)               // in1.re
    fsub.s ft3, fa1, fa5         // ft3 = in0.im - in2.im
    flw fa6, 0(a7)               // in3.re
    fadd.s ft4, fa2, fa6         // ft4 = in1.re + in3.re
    flw fa3, 4(a5)               // in1.im
    fsub.s ft5, fa2, fa6         // ft5 = in1.re - in3.re
    flw fa7, 4(a7)               // in3.im
    fadd.s ft6, fa3, fa7         // ft6 = in1.im + in3.im
    fsub.s ft7, fa3, fa7         // ft7 = in1.im - in3.im
# bfly[0].re = ft0 + ft4;
    fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
    fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 - ft7;   (inverse: conjugate rotation)
    fsub.s fa2, ft1, ft7;
# bfly[1].im = ft3 + ft5;
    fadd.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft4;
    fsub.s fa4, ft0, ft4;
    flw ft0, 0(t2)               // winc0->re
# bfly[2].im = ft2 - ft6;
    fsub.s fa5, ft2, ft6;
    flw ft2, 0(t3)               // winc1->re
# bfly[3].re = ft1 + ft7;
    fadd.s fa6, ft1, ft7;
    flw ft1, 4(t2)               // winc0->im
# bfly[3].im = ft3 - ft5;
    fsub.s fa7, ft3, ft5;
    // *ptrc0 = bfly[0];
    fsw fa0, 0(a4)               // ptrc0->re
    fsw fa1, 4(a4)               // ptrc0->im
    flw ft3, 4(t3)               // winc1->im
    // Inverse twiddle (conjugate):
    // ptrc1->re = bfly[1].re * winc0->re - bfly[1].im * winc0->im;
    // ptrc1->im = bfly[1].im * winc0->re + bfly[1].re * winc0->im;
    fmul.s fa0, fa2, ft0         // bfly[1].re * winc0->re
    add t2, t2, a3               // winc0 += 1 * wind_step;
    fmul.s fa1, fa3, ft0         // bfly[1].im * winc0->re
    fmul.s ft0, fa4, ft2         // bfly[2].re * winc1->re
    fmul.s ft2, fa5, ft2         // bfly[2].im * winc1->re
    flw ft4, 0(t4)               // winc2->re
    flw ft5, 4(t4)               // winc2->im
    fnmsub.s fa0, fa3, ft1, fa0  // ptrc1->re
    add t3, t3, a3               // winc1 += 2 * wind_step;
    fmadd.s fa1, fa2, ft1, fa1   // ptrc1->im
    add t3, t3, a3               //
    fmul.s fa2, fa6, ft4         // bfly[3].re * winc2->re
    fmul.s fa3, fa7, ft4         // bfly[3].im * winc2->re
    add t4, t4, a3               // winc2 += 3 * wind_step;
    fnmsub.s ft0, fa5, ft3, ft0  // ptrc2->re
    add t4, t4, a3               //
    fmadd.s ft2, fa4, ft3, ft2   // ptrc2->im
    fnmsub.s ft3, fa7, ft5, fa2  // ptrc3->re
    add t4, t4, a3               //
    fmadd.s fa3, fa6, ft5, fa3   // ptrc3->im
    fsw fa0, 0(a5)               // ptrc1->re
    add a4, a4, 8                // ptrc0++
    fsw fa1, 4(a5)               // ptrc1->im
    add a5, a5, 8                // ptrc1++
    fsw ft0, 0(a6)               // ptrc2->re
    fsw ft2, 4(a6)               // ptrc2->im
    add a6, a6, 8                // ptrc2++
    fsw ft3, 0(a7)               // ptrc3->re
    fsw fa3, 4(a7)               // ptrc3->im
    add a7, a7, 8                // ptrc3++
// Temp solution
.ifft2r_l3: nop                  // loop-end marker (last in-loop instruction)
    add t1, t1, 2                // j += 2
    BNE t1, t0, .ifft2r_l2
    slli t0, t0, 2               // m <<= 2
    srli t6, t6, 2               // stage counter >>= 2
    slli a3, a3, 2               // wind_step <<= 2
    BNEZ t6, .ifft2r_l1          // next stage while counter > 0
#
    add sp,sp,16
    li a0,0                      // return ESP_OK
    ret

View File

@@ -0,0 +1,197 @@
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
.text
.align 4
.global dl_fft2r_fc32_aes3_
.type dl_fft2r_fc32_aes3_,@function
// In-place radix-2 complex FFT for ESP32-S3 (Xtensa + EE extension).
// Uses ee.ldf.64 / ee.stf.64 paired-float loads/stores and software-pipelines
// the m-pointer computation across loop iterations.
// NOTE(review): local labels in this FORWARD routine are named .ifft2r_* (they
// were swapped with the inverse routine below); purely cosmetic, assembles fine.
// The function implements the following C code:
//esp_err_t dl_fft2r_fc32_ansi(float *data, int N, float *w)
//{
//    int ie, ia, m;
//    float re_temp, im_temp;
//    float c, s;
//    ie = 1;
//    for (int N2 = N/2; N2 > 0; N2 >>= 1) {
//        ia = 0;
//        for (int j = 0; j < ie; j++) {
//            c = w[2 * j];
//            s = w[2 * j + 1];
//            for (int i = 0; i < N2; i++) {
//                m = ia + N2;
//                re_temp = c * data[2 * m] + s * data[2 * m + 1];
//                im_temp = c * data[2 * m + 1] - s * data[2 * m];
//                data[2 * m] = data[2 * ia] - re_temp;
//                data[2 * m + 1] = data[2 * ia + 1] - im_temp;
//                data[2 * ia] = data[2 * ia] + re_temp;
//                data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
//                ia++;
//            }
//            ia += N2;
//        }
//        ie <<= 1;
//    }
//    return result;
//}
dl_fft2r_fc32_aes3_:
//esp_err_t dl_fft2r_fc32_aes3_(float *data, int N, float* dl_fft_w_table_fc32)
    entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - N2, stage counter: for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - j<<3, byte offset of w[2*j]
// f0 - c or w[2 * j]
// f1 - s or w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2*ia])
// a14 - m pointer (&data[2*m])
// f6 - re_temp
// f7 - im_temp
    srli    a6, a3, 1          // a6 = N2 = N/2
    movi.n  a7, 1              // a7 = ie = 1
.ifft2r_l1:
    movi.n  a8, 0              // a8 = j = 0
    movi.n  a11,0              // a11 = ia = 0
.ifft2r_l2:                    // loop over j (twiddle index)
    addx8   a10, a8, a4        // a10 = &w[2*j]
    ee.ldf.64.ip f1, f0, a10, 0 // f0 = c, f1 = s
    add.n   a12, a11, a6       // a12 = m = ia + N2 (pipelined for first iteration)
    addx8   a14, a12, a2       // a14 = &data[2*m]
    loopnez a6, .ifft2r_l3     // hardware loop; body ends just before .ifft2r_l3
    ee.ldf.64.ip f5, f4, a14, 0 // data[2*m], data[2*m + 1]
    mul.s   f6, f0, f4         // re_temp = c * data[2*m]
    mul.s   f7, f0, f5         // im_temp = c * data[2*m + 1]
    addx8   a13, a11, a2       // a13 = &data[2*ia]
    ee.ldf.64.ip f3, f2, a13, 0 // data[2*ia], data[2*ia + 1]
    madd.s  f6, f1, f5         // re_temp += s * data[2*m + 1]
    msub.s  f7, f1, f4         // im_temp -= s * data[2*m]
    addi    a11, a11, 1        // ia++
    add.n   a12, a11, a6       // a12 = m = ia + N2 (for next iteration)
    sub.s   f8, f2, f6         // data[2*ia]     - re_temp
    sub.s   f9, f3, f7         // data[2*ia + 1] - im_temp
    add.s   f10, f2, f6        // data[2*ia]     + re_temp
    add.s   f11, f3, f7        // data[2*ia + 1] + im_temp
    ee.stf.64.ip f9, f8, a14, 0
    addx8   a14, a12, a2       // a14 = &data[2*m] (for next iteration)
    ee.stf.64.ip f11, f10, a13, 0
.ifft2r_l3:                    // first instruction AFTER the hardware loop
    add.n   a11, a11, a6       // ia += N2
    addi.n  a8, a8, 1          // j++
    bne     a8, a7, .ifft2r_l2
    slli    a7, a7, 1          // ie <<= 1
// main loop: next stage while N2 > 0
    srli    a6, a6, 1          // N2 >>= 1
    bnez    a6, .ifft2r_l1
    retw
.text
.align 4
.global dl_ifft2r_fc32_aes3_
.type dl_ifft2r_fc32_aes3_,@function
// In-place radix-2 complex INVERSE FFT for ESP32-S3 (Xtensa + EE extension).
// Identical to dl_fft2r_fc32_aes3_ except the twiddle imaginary part is
// negated (conjugate twiddles). No 1/N scaling is applied here.
// NOTE(review): local labels in this INVERSE routine are named .fft2r_* (they
// were swapped with the forward routine above); purely cosmetic, assembles fine.
dl_ifft2r_fc32_aes3_:
//esp_err_t dl_ifft2r_fc32_aes3_(float *data, int N, float* dl_fft_w_table_fc32)
    entry a1, 16
// Array increment for floating point data should be 4
// data - a2
// N - a3
// dl_fft_w_table_fc32 - a4
// a6 - N2, stage counter: for (int N2 = N/2; N2 > 0; N2 >>= 1)
// a7 - ie
// a8 - j
// a10 - j<<3, byte offset of w[2*j]
// f0 - c or w[2 * j]
// f1 - s or -w[2 * j + 1]
// a11 - ia
// a12 - m
// a13 - ia pointer (&data[2*ia])
// a14 - m pointer (&data[2*m])
// f6 - re_temp
// f7 - im_temp
    srli    a6, a3, 1          // a6 = N2 = N/2
    movi.n  a7, 1              // a7 = ie = 1
.fft2r_l1:
    movi.n  a8, 0              // a8 = j = 0
    movi.n  a11,0              // a11 = ia = 0
.fft2r_l2:                     // loop over j (twiddle index)
    addx8   a10, a8, a4        // a10 = &w[2*j]
    ee.ldf.64.ip f1, f0, a10, 0 // f0 = c, f1 = w[2*j+1]
    // CHANGE: Negate the imaginary part of twiddle factors (w^-1 = conj(w))
    neg.s   f1, f1
    add.n   a12, a11, a6       // a12 = m = ia + N2 (pipelined for first iteration)
    addx8   a14, a12, a2       // a14 = &data[2*m]
    loopnez a6, .fft2r_l3      // hardware loop; body ends just before .fft2r_l3
    ee.ldf.64.ip f5, f4, a14, 0 // data[2*m], data[2*m + 1]
    mul.s   f6, f0, f4         // re_temp = c * data[2*m]
    mul.s   f7, f0, f5         // im_temp = c * data[2*m + 1]
    addx8   a13, a11, a2       // a13 = &data[2*ia]
    ee.ldf.64.ip f3, f2, a13, 0 // data[2*ia], data[2*ia + 1]
    madd.s  f6, f1, f5         // re_temp += s * data[2*m + 1]
    msub.s  f7, f1, f4         // im_temp -= s * data[2*m]
    addi    a11, a11, 1        // ia++
    add.n   a12, a11, a6       // a12 = m = ia + N2 (for next iteration)
    sub.s   f8, f2, f6         // data[2*ia]     - re_temp
    sub.s   f9, f3, f7         // data[2*ia + 1] - im_temp
    add.s   f10, f2, f6        // data[2*ia]     + re_temp
    add.s   f11, f3, f7        // data[2*ia + 1] + im_temp
    ee.stf.64.ip f9, f8, a14, 0
    addx8   a14, a12, a2       // a14 = &data[2*m] (for next iteration)
    ee.stf.64.ip f11, f10, a13, 0
.fft2r_l3:                     // first instruction AFTER the hardware loop
    add.n   a11, a11, a6       // ia += N2
    addi.n  a8, a8, 1          // j++
    bne     a8, a7, .fft2r_l2
    slli    a7, a7, 1          // ie <<= 1
// main loop: next stage while N2 > 0
    srli    a6, a6, 1          // N2 >>= 1
    bnez    a6, .fft2r_l1
    retw

// ---- (diff-viewer separator: "View File" / "@@ -0,0 +1,288 @@" — not part of
// the assembly source; next file, dl_fft4r, begins below) ----
/*
* SPDX-FileCopyrightText: 2018-2025 Espressif Systems (Shanghai) CO LTD
* SPDX-FileContributor: 2024 f4lcOn @ Libera Chat IRC
*
* SPDX-License-Identifier: Apache-2.0
*/
// dl_fft4r_fc32_aes3_: radix-4 forward FFT butterfly stages for ESP32-S3
// (Xtensa, ee.* 64-bit float load/store extensions).
// NOTE(review): unlike the C reference below, this assembly performs no
// error checks (no dl_fft4r_initialized test, no odd-log2N rejection) —
// the caller must guarantee length is a power of 4; confirm at call sites.
.section .text
.global dl_fft4r_fc32_aes3_
.type dl_fft4r_fc32_aes3_,@function
// The function implements the following C code:
// esp_err_t dl_fft4r_fc32_ansi_(float *data, int length, float *table, int table_size)
// {
// if (0 == dl_fft4r_initialized) {
// return ESP_ERR_DSP_UNINITIALIZED;
// }
//
// uint log2N = dl_power_of_two(length);
// if ((log2N & 0x01) != 0) {
// return ESP_ERR_DSP_INVALID_LENGTH;
// }
// uint log4N = log2N >> 1;
//
// fc32_t bfly[4];
// uint m = 2;
// uint wind_step = table_size / length;
// while (1) { ///radix 4
// if (log4N == 0) {
// break;
// }
// length = length >> 2;
// for (int j = 0; j < m; j += 2) { // j: which FFT of this step
// int start_index = j * (length << 1); // n: n-point FFT
//
// fc32_t *ptrc0 = (fc32_t *)data + start_index;
// fc32_t *ptrc1 = ptrc0 + length;
// fc32_t *ptrc2 = ptrc1 + length;
// fc32_t *ptrc3 = ptrc2 + length;
//
// fc32_t *winc0 = (fc32_t *)table;
// fc32_t *winc1 = winc0;
// fc32_t *winc2 = winc0;
//
// for (int k = 0; k < length; k++) {
// fc32_t in0 = *ptrc0;
// fc32_t in2 = *ptrc2;
// fc32_t in1 = *ptrc1;
// fc32_t in3 = *ptrc3;
//
// bfly[0].re = in0.re + in2.re + in1.re + in3.re;
// bfly[0].im = in0.im + in2.im + in1.im + in3.im;
//
// bfly[1].re = in0.re - in2.re + in1.im - in3.im;
// bfly[1].im = in0.im - in2.im - in1.re + in3.re;
//
// bfly[2].re = in0.re + in2.re - in1.re - in3.re;
// bfly[2].im = in0.im + in2.im - in1.im - in3.im;
//
// bfly[3].re = in0.re - in2.re - in1.im + in3.im;
// bfly[3].im = in0.im - in2.im + in1.re - in3.re;
//
// *ptrc0 = bfly[0];
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
//
// winc0 += 1 * wind_step;
// winc1 += 2 * wind_step;
// winc2 += 3 * wind_step;
//
// ptrc0++;
// ptrc1++;
// ptrc2++;
// ptrc3++;
// }
// }
// m = m << 2;
// wind_step = wind_step << 2;
// log4N--;
// }
// return ESP_OK;
// }
// esp_err_t dl_fft4r_fc32_aes3_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
// Register map: a2 = data, a3 = N (then shrinking sub-FFT length),
// a4 = twiddle table, a5 = table_size on entry (then m), a6 = w_step,
// a7 = log4N, a8 = j, a9/a10/a11 = w0/w1/w2, a12..a15 = p0..p3.
.align 4
dl_fft4r_fc32_aes3_:
entry a1, 16 # no auto vars on stack
nsau a6, a3 # inline dl_power_of_two(N)
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - nsau(N) = log2(N) for power-of-two N
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6 # set shift amount = log2(N) - 1
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1)
movi.n a5, 2 # m = 2
.stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # NOTE(review): 16-bit multiply — assumes j*N < 65536; confirm max FFT size
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .bf4_loop_end # for (uint k = 0; k < N; k++)
ee.ldf.64.ip f1, f0, a12, 0 # f0 = in0.re = *p0, f1 = in0.im = *(p0 + 1)
ee.ldf.64.ip f3, f2, a14, 0 # f2 = in2.re = *p2, f3 = in2.im = *(p2 + 1)
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
ee.ldf.64.ip f1, f0, a13, 0 # f0 = in1.re = *p1, f1 = in1.im = *(p1 + 1)
ee.ldf.64.ip f3, f2, a15, 0 # f2 = in3.re = *p3, f3 = in3.im = *(p3 + 1)
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
add.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re + in1.im - in3.im
sub.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re - in1.im + in3.im
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
sub.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im - in1.re + in3.re
add.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im + in1.re - in3.re
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
ee.stf.64.ip f5, f4, a12, 8 # *p0 = f4 = bf0.re, *(p0 + 1) = f5 = bf0.im, p0 += 2
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
msub.s f5, f2, f12 # f5 = bf1.im * w0->re - bf1.re * w0->im
madd.s f4, f3, f12 # f4 = bf1.re * w0->re + bf1.im * w0->im
msub.s f9, f0, f13 # f9 = bf2.im * w1->re - bf2.re * w1->im
madd.s f8, f1, f13 # f8 = bf2.re * w1->re + bf2.im * w1->im
msub.s f11, f6, f14 # f11 = bf3.im * w2->re - bf3.re * w2->im
madd.s f10, f7, f14 # f10 = bf3.re * w2->re + bf3.im * w2->im
addx4 a9, a6, a9 # w0 += w_step
addx8 a10, a6, a10 # w1 += 2 * w_step
addx4 a11, a6, a11 # w2 += w_step ... (combined with next line)
addx8 a11, a6, a11 # w2 += 3 * w_step total
ee.stf.64.ip f5, f4, a13, 8 # *p1 = f4, *(p1 + 1) = f5, p1 += 2
ee.stf.64.ip f9, f8, a14, 8 # *p2 = f8, *(p2 + 1) = f9, p2 += 2
ee.stf.64.ip f11, f10, a15, 8 # *p3 = f10, *(p3 + 1) = f11, p3 += 2
.bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .stage_next # if j >= m
j .group
.stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .stage # if log4N > 0
retw
// dl_ifft4r_fc32_aes3_: radix-4 INVERSE FFT butterfly stages for ESP32-S3.
// Mirror of dl_fft4r_fc32_aes3_ with the twiddle rotation conjugated:
// the lines tagged "//ifft change" swap add.s<->sub.s in the bf1/bf3
// cross terms and madd.s<->msub.s in the twiddle multiplies.
// NOTE(review): as with the forward routine, no error checks — caller
// must pass a power-of-4 length.
.section .text
.global dl_ifft4r_fc32_aes3_
.type dl_ifft4r_fc32_aes3_,@function
// esp_err_t dl_ifft4r_fc32_aes3_(data, N, dl_fft4r_w_table_fc32, dl_fft4r_w_table_size)
// Register map: a2 = data, a3 = N, a4 = twiddle table, a5 = table_size (then m),
// a6 = w_step, a7 = log4N, a8 = j, a9/a10/a11 = w0/w1/w2, a12..a15 = p0..p3.
.align 4
dl_ifft4r_fc32_aes3_:
entry a1, 16 # no auto vars on stack
nsau a6, a3 # inline dl_power_of_two(N)
movi.n a7, 31
xor a6, a6, a7 # a6 = 31 - nsau(N) = log2(N) for power-of-two N
srli a7, a6, 1 # log4N = dl_power_of_two(N) >> 1;
addi.n a6, a6, -1
ssr a6 # set shift amount = log2(N) - 1
srl a6, a5 # w_step = table_size >> (dl_power_of_two(N) - 1)
movi.n a5, 2 # m = 2
.ifft_stage:
srli a3, a3, 2 # N >>= 2
movi.n a8, 0 # j = 0
.ifft_group:
mov.n a9, a4 # w0 = w
mov.n a10, a4 # w1 = w
mov.n a11, a4 # w2 = w
mul16u a12, a8, a3 # NOTE(review): 16-bit multiply — assumes j*N < 65536; confirm max FFT size
slli a12, a12, 1 # start_index = (j * N) << 1
addx8 a12, a12, a2 # p0 = data + (start_index << 1)
addx8 a13, a3, a12 # p1 = p0 + (N << 1)
addx8 a14, a3, a13 # p2 = p1 + (N << 1)
addx8 a15, a3, a14 # p3 = p2 + (N << 1)
loopnez a3, .inv_bf4_loop_end # for (uint k = 0; k < N; k++)
ee.ldf.64.ip f1, f0, a12, 0 # f0 = in0.re = *p0, f1 = in0.im = *(p0 + 1)
ee.ldf.64.ip f3, f2, a14, 0 # f2 = in2.re = *p2, f3 = in2.im = *(p2 + 1)
add.s f5, f1, f3 # f5 = in0.im + in2.im
sub.s f7, f1, f3 # f7 = in0.im - in2.im
add.s f4, f0, f2 # f4 = in0.re + in2.re
sub.s f6, f0, f2 # f6 = in0.re - in2.re
ee.ldf.64.ip f1, f0, a13, 0 # f0 = in1.re = *p1, f1 = in1.im = *(p1 + 1)
ee.ldf.64.ip f3, f2, a15, 0 # f2 = in3.re = *p3, f3 = in3.im = *(p3 + 1)
add.s f9, f1, f3 # f9 = in1.im + in3.im
sub.s f11, f1, f3 # f11 = in1.im - in3.im
lsi f12, a9, 0 # f12 = w0->re
lsi f13, a10, 0 # f13 = w1->re
lsi f14, a11, 0 # f14 = w2->re
add.s f8, f0, f2 # f8 = in1.re + in3.re
sub.s f10, f0, f2 # f10 = in1.re - in3.re
sub.s f1, f5, f9 # f1 = bf2.im = in0.im + in2.im - in1.im - in3.im
add.s f5, f5, f9 # f5 = bf0.im = in0.im + in2.im + in1.im + in3.im
sub.s f2, f6, f11 # f2 = bf1.re = in0.re - in2.re - in1.im + in3.im //ifft change
add.s f6, f6, f11 # f6 = bf3.re = in0.re - in2.re + in1.im - in3.im //ifft change
sub.s f0, f4, f8 # f0 = bf2.re = in0.re + in2.re - in1.re - in3.re
add.s f4, f4, f8 # f4 = bf0.re = in0.re + in2.re + in1.re + in3.re
add.s f3, f7, f10 # f3 = bf1.im = in0.im - in2.im + in1.re - in3.re //ifft change
sub.s f7, f7, f10 # f7 = bf3.im = in0.im - in2.im - in1.re + in3.re //ifft change
mul.s f10, f6, f14 # f10 = bf3.re * w2->re
ee.stf.64.ip f5, f4, a12, 8 # *p0 = f4 = bf0.re, *(p0 + 1) = f5 = bf0.im, p0 += 2
mul.s f4, f2, f12 # f4 = bf1.re * w0->re
mul.s f11, f7, f14 # f11 = bf3.im * w2->re
mul.s f5, f3, f12 # f5 = bf1.im * w0->re
mul.s f8, f0, f13 # f8 = bf2.re * w1->re
mul.s f9, f1, f13 # f9 = bf2.im * w1->re
lsi f12, a9, 4 # f12 = w0->im
lsi f13, a10, 4 # f13 = w1->im
lsi f14, a11, 4 # f14 = w2->im
madd.s f5, f2, f12 # f5 = bf1.im * w0->re + bf1.re * w0->im //ifft change
msub.s f4, f3, f12 # f4 = bf1.re * w0->re - bf1.im * w0->im //ifft change
madd.s f9, f0, f13 # f9 = bf2.im * w1->re + bf2.re * w1->im //ifft change
msub.s f8, f1, f13 # f8 = bf2.re * w1->re - bf2.im * w1->im //ifft change
madd.s f11, f6, f14 # f11 = bf3.im * w2->re + bf3.re * w2->im //ifft change
msub.s f10, f7, f14 # f10 = bf3.re * w2->re - bf3.im * w2->im //ifft change
addx4 a9, a6, a9 # w0 += w_step
addx8 a10, a6, a10 # w1 += 2 * w_step
addx4 a11, a6, a11 # w2 += w_step ... (combined with next line)
addx8 a11, a6, a11 # w2 += 3 * w_step total
ee.stf.64.ip f5, f4, a13, 8 # *p1 = f4, *(p1 + 1) = f5, p1 += 2
ee.stf.64.ip f9, f8, a14, 8 # *p2 = f8, *(p2 + 1) = f9, p2 += 2
ee.stf.64.ip f11, f10, a15, 8 # *p3 = f10, *(p3 + 1) = f11, p3 += 2
.inv_bf4_loop_end:
addi.n a8, a8, 2 # j += 2
bgeu a8, a5, .ifft_stage_next # if j >= m
j .ifft_group
.ifft_stage_next:
slli a5, a5, 2 # m <<= 2
slli a6, a6, 2 # w_step <<= 2
addi.n a7, a7, -1 # log4N--
bnez a7, .ifft_stage # if log4N > 0
retw