add some code
This commit is contained in:
@@ -0,0 +1,180 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_fft2r_platform.h"
|
||||
#if (dsps_fft2r_sc16_ae32_enabled == 1)
|
||||
|
||||
// This is the radix-2 FFT function for 16-bit fixed-point complex data on the ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_fft2r_sc16_ae32_
|
||||
.type dsps_fft2r_sc16_ae32_,@function
|
||||
|
||||
.global dsps_fft_w_table_sc16;
|
||||
|
||||
//The function implements the following C code:
|
||||
//esp_err_t dsps_fft2r_sc16_ansi(int16_t *data, int N)
|
||||
//{
|
||||
// esp_err_t result = ESP_OK;
|
||||
// uint32_t *w = (uint32_t*)dsps_fft_w_table_sc16;
|
||||
// uint32_t *in_data = (uint32_t *)data;
|
||||
// int ie, ia, m;
|
||||
// sc16_t temp;
|
||||
// sc16_t cs;// c - re, s - im
|
||||
// sc16_t m_data;
|
||||
// sc16_t a_data;
|
||||
|
||||
// ie = 1;
|
||||
// for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
|
||||
// ia = 0;
|
||||
// for (int j = 0; j < ie; j++) {
|
||||
// cs.data = w[j];
|
||||
// //c = w[2 * j];
|
||||
// //s = w[2 * j + 1];
|
||||
// for (int i = 0; i < N2; i++) {
|
||||
// m = ia + N2;
|
||||
// m_data.data = in_data[m];
|
||||
// a_data.data = in_data[ia];
|
||||
// sc16_t m1;
|
||||
// m1.re = xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16);//(a_data.re - temp.re + shift_const) >> 1;
|
||||
// m1.im = xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16);//(a_data.im - temp.im + shift_const) >> 1;
|
||||
// in_data[m] = m1.data;
|
||||
// sc16_t m2;
|
||||
// m2.re = xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16);//(a_data.re + temp.re + shift_const) >> 1;
|
||||
// m2.im = xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16);//(a_data.im + temp.im + shift_const)>>1;
|
||||
// in_data[ia] = m2.data;
|
||||
// ia++;
|
||||
// }
|
||||
// ia += N2;
|
||||
// }
|
||||
// ie <<= 1;
|
||||
// }
|
||||
// return result;
|
||||
// }
|
||||
|
||||
dsps_fft2r_sc16_ae32_:
// In-place radix-2 butterfly passes over 16-bit complex samples using the
// Xtensa MAC16 unit. The loop nest follows dsps_fft2r_sc16_ansi_():
//
//   for (N2 = N / 2, ie = 1; N2 > 0; N2 >>= 1, ie <<= 1)
//       for (j = 0, ia = 0; j < ie; j++, ia += N2)
//           for (i = 0; i < N2; i++, ia++)
//               butterfly(in_data[ia], in_data[ia + N2], w[j]);
//
// C view of the arguments (by analogy with dsps_fft2r_sc16_ansi_):
//   a2 - int16_t *data   interleaved re/im pairs, one 32-bit word per sample
//   a3 - int N           number of complex samples
//   a4 - int16_t *w      twiddle table, one 32-bit word (re low, im high) per entry
// Returns a2 = 0. No power-of-two or "initialized" check is done here.
//
// The table pointer is taken from the third argument; the original author
// noted that loading it with "l32r a4, dsps_fft_w_table_sc16_ae32" did not work.
//
// Register roles:
//   a5  - constant 1 (multiplied with m1 to preload the accumulator)
//   a6  - N2, also the trip count of the butterfly loop
//   a7  - ie
//   a8  - j
//   a9  - data - 4 (base biased for ldinc, which adds 4 before loading)
//   a10 - address used to fetch w[j]
//   a11 - ia
//   a12 - m = ia + N2
//   a13 - address of in_data[ia] (valid after its ldinc)
//   a14 - address of in_data[m]  (valid after its ldinc)
//   a15 - scratch for the shifted result
//   m0  - w[j]   m1 - w[0]   m2 - in_data[m]   m3 - in_data[ia]
//   SAR - 16, the right shift applied to every accumulator result

        entry       a1, 16

        // ldinc pre-increments its address register by 4, so both base
        // pointers are biased by -4 before use.
        addi        a4, a4, -4
        addi        a9, a2, -4              // a9 = data - 4

        ldinc       m1, a4                  // m1 = w[0]; its low half is the 0x7fff scale factor
        addi        a4, a4, -4              // a4 = w - 4 again, ready for w[j] fetches

        movi.n      a5, 1                   // a5 = 1
        srli        a6, a3, 1               // N2 = N / 2

        ssai        16                      // SAR = 16 for the sra instructions below
        movi        a7, 1                   // ie = 1

.Lfft2r_stage:                              // one pass per value of N2
        movi        a8, 0                   // j = 0
        movi        a11, 0                  // ia = 0

.Lfft2r_group:                              // one pass per twiddle w[j]
        slli        a10, a8, 2              // byte offset of w[j]
        add.n       a10, a10, a4            // a10 = &w[j] - 4
        ldinc       m0, a10                 // m0 = w[j]

        loopnez     a6, .Lfft2r_bfly_end    // N2 butterflies, skipped when N2 == 0
        add.n       a12, a11, a6            // m = ia + N2

        slli        a14, a12, 2             // byte offset of in_data[m]
        slli        a13, a11, 2             // byte offset of in_data[ia]
        add.n       a14, a14, a9            // &in_data[m]  - 4
        add.n       a13, a13, a9            // &in_data[ia] - 4

        ldinc       m2, a14                 // m2 = in_data[m];  a14 = &in_data[m]
        mul.da.ll   m1, a5                  // acc = 0x7fff * 1 (rounding term)
        ldinc       m3, a13                 // m3 = in_data[ia]; a13 = &in_data[ia]

        // in_data[m].re : acc = round + 0x7fff*a.re - cs.re*m.re - cs.im*m.im
        muls.dd.ll  m0, m2                  // acc -= cs.re * m_data.re
        mula.dd.ll  m1, m3                  // acc += 0x7fff * a_data.re
        muls.dd.hh  m0, m2                  // acc -= cs.im * m_data.im
        rsr         a15, acclo
        mul.da.ll   m1, a5                  // restart acc with the rounding term
        sra         a15, a15                // >> 16

        // in_data[m].im : acc = round + 0x7fff*a.im - cs.re*m.im + cs.im*m.re
        muls.dd.lh  m0, m2                  // acc -= cs.re * m_data.im
        s16i        a15, a14, 0             // store in_data[m].re
        mula.dd.lh  m1, m3                  // acc += 0x7fff * a_data.im
        mula.dd.hl  m0, m2                  // acc += cs.im * m_data.re
        rsr         a15, acclo
        mul.da.ll   m1, a5                  // restart acc with the rounding term
        sra         a15, a15                // >> 16

        // in_data[ia].re : acc = round + 0x7fff*a.re + cs.re*m.re + cs.im*m.im
        mula.dd.ll  m0, m2                  // acc += cs.re * m_data.re
        s16i        a15, a14, 2             // store in_data[m].im
        mula.dd.ll  m1, m3                  // acc += 0x7fff * a_data.re
        mula.dd.hh  m0, m2                  // acc += cs.im * m_data.im
        rsr         a15, acclo
        mul.da.ll   m1, a5                  // restart acc with the rounding term
        sra         a15, a15                // >> 16

        // in_data[ia].im : acc = round + 0x7fff*a.im + cs.re*m.im - cs.im*m.re
        mula.dd.lh  m0, m2                  // acc += cs.re * m_data.im
        s16i        a15, a13, 0             // store in_data[ia].re
        mula.dd.lh  m1, m3                  // acc += 0x7fff * a_data.im
        muls.dd.hl  m0, m2                  // acc -= cs.im * m_data.re
        rsr         a15, acclo
        sra         a15, a15                // >> 16
        s16i        a15, a13, 2             // store in_data[ia].im

        addi        a11, a11, 1             // ia++
.Lfft2r_bfly_end:
        add         a11, a11, a6            // ia += N2 (skip the half just written as "m")

        addi        a8, a8, 1               // j++
        bne         a8, a7, .Lfft2r_group   // while (j != ie)

        slli        a7, a7, 1               // ie <<= 1
        srli        a6, a6, 1               // N2 >>= 1
        bnez        a6, .Lfft2r_stage       // while (N2 != 0)

        movi.n      a2, 0                   // return 0 (ESP_OK in the C reference)
        retw.n
|
||||
|
||||
#endif // dsps_fft2r_sc16_ae32_enabled
|
||||
@@ -0,0 +1,169 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_fft2r_platform.h"
|
||||
#if (dsps_fft2r_sc16_aes3_enabled == 1)
|
||||
|
||||
// This is the radix-2 FFT function for 16-bit fixed-point complex data on the ESP32-S3 processor.
|
||||
.text
|
||||
.align 4
|
||||
.literal_position
|
||||
.literal .LC0_5_39, dsps_fft2r_sc16_initialized
|
||||
.literal .LC1_5_40, 32767
|
||||
.literal .LC2_5_41, 458756
|
||||
.literal .LC3_5_42, 458753
|
||||
|
||||
# Program Unit: dsps_fft2r_sc16_aes3_
|
||||
.type dsps_fft2r_sc16_aes3_, @function
|
||||
.align 4
|
||||
.global dsps_fft2r_sc16_aes3_
|
||||
dsps_fft2r_sc16_aes3_:
// In-place radix-2 FFT passes over 16-bit complex samples using the ee.*
// vector instructions. This looks like compiler-generated code; the
// instruction order is kept exactly as emitted.
//
// Arguments (windowed ABI):
//   a2 - data pointer (also advanced by the last stage)
//   a3 - N (reused as a loop count after validation)
//   a4 - twiddle table pointer (also advanced by the last stage)
// Returns in a2:
//   0               on the normal path
//   .LC3_5_42 value when dsp_is_power_of_two(N) returns 0
//                   (presumably ESP_ERR_DSP_INVALID_LENGTH - confirm)
//   .LC2_5_41 value when dsps_fft2r_sc16_initialized is 0
//                   (presumably ESP_ERR_DSP_UNINITIALIZED - confirm)
//
// Stack frame (64 bytes): [a1 + 0] holds the 16-bit constant 32767 that is
// broadcast into q6; [a1 + 16] is a 128-bit spill slot for a q register.
//
// Register roles in the stage loop:
//   a5  - ie (number of twiddle groups in the current stage)
//   a6  - N2 (half the butterfly span, in complex samples)
//   a13 - running twiddle pointer      a14 - j
//   a15 - N2 * 4 (bytes per half)      a3  - N2 / 4 (inner trip count)
//   a9  - pointer to the "ia" half     a8  - pointer to the "m" half
//   q5  - current twiddle broadcast    q6  - 32767 broadcast, SAR = 16
//
// NOTE(review): when N / 2 < 3 the code jumps to .Lsmall_n with ie = 1; the
// last-stage trip count (ie * 2) >> 2 is then 0, so no butterfly runs and the
// function still returns 0. Confirm that callers never pass N < 8.
.Lentry_dsps_fft2r_sc16_aes3_:
        .frequency 1.000 0.000
        entry       a1, 64
        mov.n       a10, a3                     // argument for dsp_is_power_of_two(N)
        .type       dsp_is_power_of_two, @function
        call8       dsp_is_power_of_two

        beqz.n      a10, .Lerr_bad_length       // result 0 -> reject N

        l32r        a8, .LC0_5_39               // &dsps_fft2r_sc16_initialized
        l8ui        a8, a8, 0
        beqz.n      a8, .Lerr_uninitialized

        mov.n       a9, a1                      // a9 = address of the stack constant
        l32r        a8, .LC1_5_40               // a8 = 32767
        addi.n      a6, a3, 1
        movi.n      a10, 16
        wsr.sar     a10                         // SAR = 16
        movgez      a6, a3, a3                  // a6 = (N >= 0) ? N : N + 1
        s16i        a8, a1, 0                   // stack[0] = 32767
        ee.vldbc.16.ip  q6, a9, 0               // q6 = broadcast of stack[0]
        srai        a6, a6, 1                   // N2 = N / 2 (signed, rounds toward zero)
        bltui       a6, 3, .Lsmall_n            // fewer than 3 -> skip the wide stages

        movi.n      a5, 1                       // ie = 1

.Lstage_loop:                                   // stages with N2 >= 3
        mov.n       a13, a4                     // twiddle pointer back to table start
        mov.n       a9, a2                      // "ia" pointer back to data start
        beqz.n      a5, .Lstage_next

        srli        a3, a6, 2                   // inner trip count = N2 / 4
        movi.n      a14, 0                      // j = 0
        slli        a15, a6, 2                  // bytes per half = N2 * 4
        add.n       a8, a15, a9                 // "m" pointer = "ia" pointer + half

.Lgroup_loop:                                   // one pass per twiddle
        ee.vldbc.32.ip  q5, a13, 4              // q5 = next twiddle, broadcast; a13 += 4
        loopnez     a3, .Lgroup_tail

.Lgroup_body:                                   // 16 bytes of each half per iteration
        ee.vld.128.ip   q0, a8, 0               // q0 = "m" samples
        ee.vld.128.ip   q2, a9, 0               // q2 = "ia" samples
        ee.cmul.s16     q1, q5, q0, 2           // q1 = twiddle * m (first selection)
        ee.vmul.s16     q2, q2, q6              // q2 = ia * 32767, shifted by SAR
        ee.cmul.s16     q1, q5, q0, 3           // q1 = twiddle * m (second selection)
        ee.vsubs.s16    q3, q2, q1              // q3 = ia - product
        ee.vadds.s16.st.incp q3, a8, q3, q2, q1 // store q3 to "m", a8 += 16; q3 = ia + product
        ee.vst.128.ip   q3, a9, 16              // store sum to "ia", a9 += 16

.Lgroup_tail:
        addi.n      a14, a14, 1                 // j++
        add.n       a9, a9, a15                 // skip the half already written as "m"
        add.n       a8, a15, a9                 // "m" pointer = "ia" pointer + half
        bne         a14, a5, .Lgroup_loop

.Lstage_next:
        slli        a5, a5, 1                   // ie <<= 1
        srli        a6, a6, 1                   // N2 >>= 1
        bgeui       a6, 3, .Lstage_loop

        // Stage with N2 == 2: ie / 2 iterations, two twiddles and 32 bytes
        // of data per iteration.
        srli        a10, a5, 1
        beqz.n      a10, .Lno_n2_2_stage

        mov.n       a9, a4                      // twiddle pointer
        mv.qr       q4, q1                      // register shuffling emitted by the compiler
        mov.n       a8, a2                      // data pointer
        mv.qr       q1, q0
        mv.qr       q0, q2
        loopnez     a10, .Ln2_2_done

.Ln2_2_body:
        ee.vld.l.64.ip  q0, a8, 8               // gather: q0 takes 64-bit chunks 0 and 2,
        ee.vldbc.32.ip  q2, a9, 4               //         q1 takes chunks 1 and 3
        ee.vld.l.64.ip  q1, a8, 8
        ee.vldbc.32.ip  q3, a9, 4
        ee.vld.h.64.ip  q0, a8, 8
        ee.vunzip.32    q2, q3                  // combine the two broadcast twiddles
        ee.vld.h.64.ip  q1, a8, -24             // rewind a8 to the start of the 32 bytes
        ee.vmul.s16     q0, q0, q6
        ee.cmul.s16     q4, q2, q1, 2
        ee.cmul.s16     q4, q2, q1, 3
        ee.vadds.s16    q2, q0, q4              // sums
        ee.vsubs.s16    q3, q0, q4              // differences
        ee.vst.l.64.ip  q2, a8, 8               // scatter back in the order loaded
        ee.vst.l.64.ip  q3, a8, 8
        ee.vst.h.64.ip  q2, a8, 8
        ee.vst.h.64.ip  q3, a8, 8

.Ln2_2_done:
        .frequency 0.608 0.000
        st.qr       q4, a1, 16                  // spill to the stack slot

.Llast_stage:                                   // stage with N2 == 1
        ld.qr       q3, a1, 16                  // reload the spill slot
        slli        a10, a5, 1
        srli        a10, a10, 2                 // trip count = (ie * 2) >> 2
        loopnez     a10, .Lreturn_ok

.Llast_body:                                    // 32 bytes of data, 16 bytes of twiddles
        ee.vld.128.ip   q0, a2, 16
        ee.vld.128.ip   q1, a2, -16             // a2 back at the start of the 32 bytes
        ee.vld.128.ip   q2, a4, 16              // next four twiddles
        ee.vunzip.32    q0, q1                  // de-interleave 32-bit samples into q0 / q1
        ee.cmul.s16     q3, q2, q1, 2
        ee.vmul.s16     q0, q0, q6
        ee.cmul.s16     q3, q2, q1, 3
        ee.vsubs.s16    q1, q0, q3              // differences
        ee.vadds.s16    q0, q0, q3              // sums
        ee.vzip.32      q0, q1                  // re-interleave
        ee.vst.128.ip   q0, a2, 16
        ee.vst.128.ip   q1, a2, 16

.Lreturn_ok:
        movi.n      a2, 0
        retw.n

.Lsmall_n:                                      // N / 2 < 3
        movi.n      a5, 1
        j           .Llast_stage

.Lerr_uninitialized:
        l32r        a2, .LC2_5_41
        retw.n

.Lerr_bad_length:
        l32r        a2, .LC3_5_42
        retw.n

.Lno_n2_2_stage:                                // ie / 2 == 0: skip the N2 == 2 stage
        st.qr       q1, a1, 16
        j           .Llast_stage
|
||||
|
||||
|
||||
#endif // dsps_fft2r_sc16_aes3_enabled
|
||||
@@ -0,0 +1,315 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_fft2r.h"
|
||||
#include "dsp_common.h"
|
||||
#include "dsp_types.h"
|
||||
#include <math.h>
|
||||
#include "esp_attr.h"
|
||||
#include <malloc.h>
|
||||
#include "dsp_tests.h"
|
||||
|
||||
|
||||
// Twiddle-factor table shared by the sc16 FFT routines. Set by
// dsps_fft2r_init_sc16() to either the caller's buffer or a memalign() block.
int16_t *dsps_fft_w_table_sc16;
|
||||
// Size recorded for the table above, counted in int16_t elements (the value
// handed to dsps_gen_w_r2_sc16()).
int dsps_fft_w_table_sc16_size;
|
||||
// Non-zero once dsps_fft2r_init_sc16() has completed successfully; cleared by
// dsps_fft2r_deinit_sc16().
uint8_t dsps_fft2r_sc16_initialized = 0;
|
||||
// Non-zero when the table was allocated internally, so that
// dsps_fft2r_deinit_sc16() knows it has to free() it.
uint8_t dsps_fft2r_sc16_mem_allocated = 0;
|
||||
|
||||
// Defined outside this part of the file; dsps_cplx2real_sc16_ansi() calls it
// to obtain the table index for bin k.
unsigned short reverse(unsigned short x, unsigned short N, int order);
|
||||
|
||||
// Rounding term added to every butterfly result before the final shift.
static const int add_rount_mult = 0x7fff;
// Factor applied to the un-rotated sample so it is on the same scale as the
// 16x16-bit twiddle products (0x7fff, i.e. just under 1 << 15).
static const int mult_shift_const = 0x7fff;

/*
 * The four helpers below compute one output component of a radix-2 butterfly:
 *
 *     (a0 * mult_shift_const  -/+  (a1 * a2  +/-  a3 * a4)  + add_rount_mult) >> result_shift
 *
 * truncated to int16_t. They differ only in the two signs:
 *
 *     xtfixed_bf_1:  a0*C - (a1*a2 + a3*a4)
 *     xtfixed_bf_2:  a0*C - (a1*a2 - a3*a4)
 *     xtfixed_bf_3:  a0*C + (a1*a2 + a3*a4)
 *     xtfixed_bf_4:  a0*C + (a1*a2 - a3*a4)
 */

static inline int16_t xtfixed_bf_1(int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift)
{
    int acc = a0 * mult_shift_const;
    const int32_t rotated = (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;

    acc -= rotated;
    acc += add_rount_mult;
    return (int16_t)(acc >> result_shift);
}

static inline int16_t xtfixed_bf_2(int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift)
{
    int acc = a0 * mult_shift_const;
    const int32_t rotated = (int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4;

    acc -= rotated;
    acc += add_rount_mult;
    return (int16_t)(acc >> result_shift);
}

static inline int16_t xtfixed_bf_3(int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift)
{
    int acc = a0 * mult_shift_const;
    const int32_t rotated = (int32_t)a1 * (int32_t)a2 + (int32_t)a3 * (int32_t)a4;

    acc += rotated;
    acc += add_rount_mult;
    return (int16_t)(acc >> result_shift);
}

static inline int16_t xtfixed_bf_4(int16_t a0, int16_t a1, int16_t a2, int16_t a3, int16_t a4, int result_shift)
{
    int acc = a0 * mult_shift_const;
    const int32_t rotated = (int32_t)a1 * (int32_t)a2 - (int32_t)a3 * (int32_t)a4;

    acc += rotated;
    acc += add_rount_mult;
    return (int16_t)(acc >> result_shift);
}
|
||||
|
||||
esp_err_t dsps_fft2r_init_sc16(int16_t *fft_table_buff, int table_size)
|
||||
{
|
||||
esp_err_t result = ESP_OK;
|
||||
if (dsps_fft2r_sc16_initialized != 0) {
|
||||
return result;
|
||||
}
|
||||
if (table_size > CONFIG_DSP_MAX_FFT_SIZE) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (table_size == 0) {
|
||||
return result;
|
||||
}
|
||||
if (fft_table_buff != NULL) {
|
||||
if (dsps_fft2r_sc16_mem_allocated) {
|
||||
return ESP_ERR_DSP_REINITIALIZED;
|
||||
}
|
||||
dsps_fft_w_table_sc16 = fft_table_buff;
|
||||
dsps_fft_w_table_sc16_size = table_size;
|
||||
} else {
|
||||
if (!dsps_fft2r_sc16_mem_allocated) {
|
||||
dsps_fft_w_table_sc16 = (int16_t *)memalign(16, CONFIG_DSP_MAX_FFT_SIZE * sizeof(int16_t));
|
||||
}
|
||||
dsps_fft_w_table_sc16_size = CONFIG_DSP_MAX_FFT_SIZE;
|
||||
dsps_fft2r_sc16_mem_allocated = 1;
|
||||
}
|
||||
|
||||
result = dsps_gen_w_r2_sc16(dsps_fft_w_table_sc16, dsps_fft_w_table_sc16_size);
|
||||
if (result != ESP_OK) {
|
||||
return result;
|
||||
}
|
||||
result = dsps_bit_rev_sc16_ansi(dsps_fft_w_table_sc16, dsps_fft_w_table_sc16_size >> 1);
|
||||
if (result != ESP_OK) {
|
||||
return result;
|
||||
}
|
||||
dsps_fft2r_sc16_initialized = 1;
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
void dsps_fft2r_deinit_sc16()
|
||||
{
|
||||
if (dsps_fft2r_sc16_mem_allocated) {
|
||||
free(dsps_fft_w_table_sc16);
|
||||
}
|
||||
dsps_fft2r_sc16_mem_allocated = 0;
|
||||
dsps_fft2r_sc16_initialized = 0;
|
||||
}
|
||||
|
||||
esp_err_t dsps_fft2r_sc16_ansi_(int16_t *data, int N, int16_t *sc_table)
|
||||
{
|
||||
if (!dsp_is_power_of_two(N)) {
|
||||
return ESP_ERR_DSP_INVALID_LENGTH;
|
||||
}
|
||||
if (!dsps_fft2r_sc16_initialized) {
|
||||
return ESP_ERR_DSP_UNINITIALIZED;
|
||||
}
|
||||
|
||||
esp_err_t result = ESP_OK;
|
||||
|
||||
uint32_t *w = (uint32_t *)sc_table;
|
||||
uint32_t *in_data = (uint32_t *)data;
|
||||
|
||||
int ie, ia, m;
|
||||
sc16_t cs;// c - re, s - im
|
||||
sc16_t m_data;
|
||||
sc16_t a_data;
|
||||
|
||||
ie = 1;
|
||||
for (int N2 = N / 2; N2 > 0; N2 >>= 1) {
|
||||
ia = 0;
|
||||
for (int j = 0; j < ie; j++) {
|
||||
cs.data = w[j];
|
||||
//c = w[2 * j];
|
||||
//s = w[2 * j + 1];
|
||||
for (int i = 0; i < N2; i++) {
|
||||
m = ia + N2;
|
||||
m_data.data = in_data[m];
|
||||
a_data.data = in_data[ia];
|
||||
//data[2 * m] = data[2 * ia] - re_temp;
|
||||
//data[2 * m + 1] = data[2 * ia + 1] - im_temp;
|
||||
sc16_t m1;
|
||||
m1.re = xtfixed_bf_1(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16);//(a_data.re - temp.re + shift_const) >> 1;
|
||||
m1.im = xtfixed_bf_2(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16);//(a_data.im - temp.im + shift_const) >> 1;
|
||||
in_data[m] = m1.data;
|
||||
|
||||
//data[2 * ia] = data[2 * ia] + re_temp;
|
||||
//data[2 * ia + 1] = data[2 * ia + 1] + im_temp;
|
||||
sc16_t m2;
|
||||
m2.re = xtfixed_bf_3(a_data.re, cs.re, m_data.re, cs.im, m_data.im, 16);//(a_data.re + temp.re + shift_const) >> 1;
|
||||
m2.im = xtfixed_bf_4(a_data.im, cs.re, m_data.im, cs.im, m_data.re, 16);//(a_data.im + temp.im + shift_const)>>1;
|
||||
in_data[ia] = m2.data;
|
||||
ia++;
|
||||
}
|
||||
ia += N2;
|
||||
}
|
||||
ie <<= 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Reverse the bit order of the low `order` bits of x.
 *
 * All 16 bits are mirrored first, then the result is shifted right by
 * (16 - order). N is accepted for signature compatibility and is not used.
 */
static inline unsigned short reverse_sc16(unsigned short x, unsigned short N, int order)
{
    unsigned int bits = x;
    (void)N;

    // Mirror the 16-bit value by swapping ever larger neighbouring groups.
    bits = ((bits >> 1) & 0x5555u) | ((bits & 0x5555u) << 1);  // single bits
    bits = ((bits >> 2) & 0x3333u) | ((bits & 0x3333u) << 2);  // bit pairs
    bits = ((bits >> 4) & 0x0f0fu) | ((bits & 0x0f0fu) << 4);  // nibbles
    bits = ((bits >> 8) & 0x00ffu) | ((bits & 0x00ffu) << 8);  // bytes

    return (unsigned short)(bits >> (16 - order));
}
|
||||
|
||||
esp_err_t dsps_bit_rev_sc16_ansi(int16_t *data, int N)
|
||||
{
|
||||
if (!dsp_is_power_of_two(N)) {
|
||||
return ESP_ERR_DSP_INVALID_LENGTH;
|
||||
}
|
||||
esp_err_t result = ESP_OK;
|
||||
|
||||
int j, k;
|
||||
uint32_t temp;
|
||||
uint32_t *in_data = (uint32_t *)data;
|
||||
j = 0;
|
||||
for (int i = 1; i < (N - 1); i++) {
|
||||
k = N >> 1;
|
||||
while (k <= j) {
|
||||
j -= k;
|
||||
k >>= 1;
|
||||
}
|
||||
j += k;
|
||||
if (i < j) {
|
||||
temp = in_data[j];
|
||||
in_data[j] = in_data[i];
|
||||
in_data[i] = temp;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
esp_err_t dsps_gen_w_r2_sc16(int16_t *w, int N)
|
||||
{
|
||||
if (!dsp_is_power_of_two(N)) {
|
||||
return ESP_ERR_DSP_INVALID_LENGTH;
|
||||
}
|
||||
|
||||
esp_err_t result = ESP_OK;
|
||||
|
||||
int i;
|
||||
float e = M_PI * 2.0 / N;
|
||||
|
||||
for (i = 0; i < (N >> 1); i++) {
|
||||
w[2 * i] = (int16_t)(INT16_MAX * cosf(i * e));
|
||||
w[2 * i + 1] = (int16_t)(INT16_MAX * sinf(i * e));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
esp_err_t dsps_cplx2reC_sc16(int16_t *data, int N)
|
||||
{
|
||||
if (!dsp_is_power_of_two(N)) {
|
||||
return ESP_ERR_DSP_INVALID_LENGTH;
|
||||
}
|
||||
esp_err_t result = ESP_OK;
|
||||
|
||||
int i;
|
||||
int n2 = N << (1); // we will operate with int32 indexes
|
||||
uint32_t *in_data = (uint32_t *)data;
|
||||
|
||||
sc16_t kl;
|
||||
sc16_t kh;
|
||||
sc16_t nl;
|
||||
sc16_t nh;
|
||||
|
||||
for (i = 0; i < (N / 4); i++) {
|
||||
kl.data = in_data[i + 1];
|
||||
nl.data = in_data[N - i - 1];
|
||||
kh.data = in_data[i + 1 + N / 2];
|
||||
nh.data = in_data[N - i - 1 - N / 2];
|
||||
|
||||
data[i * 2 + 0 + 2] = kl.re + nl.re;
|
||||
data[i * 2 + 1 + 2] = kl.im - nl.im;
|
||||
|
||||
data[n2 - i * 2 - 1 - N] = kh.re + nh.re;
|
||||
data[n2 - i * 2 - 2 - N] = kh.im - nh.im;
|
||||
|
||||
data[i * 2 + 0 + 2 + N] = kl.im + nl.im;
|
||||
data[i * 2 + 1 + 2 + N] = kl.re - nl.re;
|
||||
|
||||
data[n2 - i * 2 - 1] = kh.im + nh.im;
|
||||
data[n2 - i * 2 - 2] = kh.re - nh.re;
|
||||
}
|
||||
data[N] = data[1];
|
||||
data[1] = 0;
|
||||
data[N + 1] = 0;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
esp_err_t dsps_cplx2real_sc16_ansi(int16_t *data, int N)
|
||||
{
|
||||
|
||||
int order = dsp_power_of_two(N);
|
||||
sc16_t *table = (sc16_t *)dsps_fft_w_table_sc16;
|
||||
sc16_t *result = (sc16_t *)data;
|
||||
// Original formula...
|
||||
// result[0].re = result[0].re + result[0].im;
|
||||
// result[N].re = result[0].re - result[0].im;
|
||||
// result[0].im = 0;
|
||||
// result[N].im = 0;
|
||||
// Optimized one:
|
||||
int16_t tmp_re = result[0].re;
|
||||
result[0].re = (tmp_re + result[0].im) >> 1;
|
||||
result[0].im = (tmp_re - result[0].im) >> 1;
|
||||
|
||||
sc16_t f1k, f2k;
|
||||
for (int k = 1; k <= N / 2 ; ++k ) {
|
||||
sc16_t fpk = result[k];
|
||||
sc16_t fpnk;
|
||||
fpnk.re = result[N - k].re;
|
||||
fpnk.im = result[N - k].im;
|
||||
f1k.re = fpk.re + fpnk.re;
|
||||
f1k.im = fpk.im - fpnk.im;
|
||||
f2k.re = fpk.re - fpnk.re;
|
||||
f2k.im = fpk.im + fpnk.im;
|
||||
|
||||
int table_index = reverse(k, N, order);
|
||||
|
||||
// float c = -dsps_fft_w_table_fc32[table_index*2+1];
|
||||
// float s = -dsps_fft_w_table_fc32[table_index*2+0];
|
||||
sc16_t w = table[table_index];
|
||||
|
||||
sc16_t tw;
|
||||
{
|
||||
int re = (w.re * f2k.im - w.im * f2k.re) >> 15;
|
||||
int im = (+w.re * f2k.re + w.im * f2k.im) >> 15;
|
||||
tw.re = re;
|
||||
tw.im = im;
|
||||
}
|
||||
|
||||
result[k].re = (f1k.re + tw.re) >> 2;
|
||||
result[k].im = (f1k.im - tw.im) >> 2;
|
||||
result[N - k].re = (f1k.re - tw.re) >> 2;
|
||||
result[N - k].im = -(f1k.im + tw.im) >> 2;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_fft2r_platform.h"
|
||||
|
||||
#if (dsps_fft2r_sc16_arp4_enabled == 1)
|
||||
|
||||
// This is the radix-2 FFT function for 16-bit fixed-point complex data on the esp32p4 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_fft2r_sc16_arp4_
|
||||
.type dsps_fft2r_sc16_arp4_,@function
|
||||
|
||||
dsps_fft2r_sc16_arp4_:
// esp_err_t dsps_fft2r_sc16_arp4_(int16_t *data, int N, int16_t *w);
//
// In-place radix-2 FFT passes over 16-bit complex samples using the esp.*
// vector instructions.
//
// Arguments:
//   a0 - data   interleaved re/im int16_t pairs
//   a1 - N      number of complex samples
//   a2 - w      twiddle table, one 32-bit complex word per entry
// Returns a0 = 0. No validation of N or of the table is done here.
//
// Register roles:
//   t6  - N2                         t0  - ie
//   t1  - j                          t2  - constant 2 (stage-loop limit)
//   t4  - ia                         t5  - m = ia + N2
//   a3  - address of data[ia]        a4  - address of data[m] (scratch before that)
//   a7  - butterfly loop count, N2 / 4
//   x24 (s8)  - running pointer into the "ia" half / data
//   x25 (s9)  - running pointer into the "m" half
//   x26 (s10) - running pointer into the twiddle table
//   q6  - 0x7fff broadcast to every 16-bit lane;  SAR = 16
//
// Stack frame (16 bytes): 0(sp) holds the 0x7fff constant, 4..12(sp) hold the
// saved s8, s9 and s10.
//
// NOTE(review): the two tail loops each run at least once and every
// iteration loads and stores 32 bytes of data, so for N < 8 they access
// memory beyond the N samples. Confirm that callers never pass N < 8.

        add         sp, sp, -16
        sw          s8, 4(sp)
        sw          s9, 8(sp)
        sw          s10, 12(sp)

        li          a4, 16
        esp.movx.w.sar a4                       // SAR = 16

        li          a4, 0x7fff
        sw          a4, 0(sp)
        mv          x26, sp
        esp.vldbc.16.ip q6, x26, 0              // q6 = 0x7fff in every lane

        srli        t6, a1, 1                   // t6 = N2 = N/2
        li          t0, 1                       // t0 - ie
        li          t2, 2                       // t2 = 2 : limit for the loop N2 > 2

.fft2r_l1:                                      // one pass per stage
        li          t1, 0                       // t1 - j
        li          t4, 0                       // t4 = ia = 0;
        mv          x26, a2                     // x26 - pointer to w
.fft2r_l2:                                      // loop for j, t1 - j

        esp.vldbc.32.ip q5, x26, 4              // q5 = w[j] broadcast; x26 += 4
        add         t5, t4, t6                  // t5 = m = ia + N2

        slli        a4, t5, 2                   // a4 - byte offset for m
        slli        a3, t4, 2                   // a3 - byte offset for ia
        add         a4, a4, a0                  // a4 = &data[m*2]
        add         a3, a3, a0                  // a3 = &data[ia*2]
        mv          x24, a3
        mv          x25, a4
        add         t4, t4, t6                  // ia += N2 instead of ia++ for each cycle
        srli        a7, t6, 2                   // a7 = N2 / 4 (four samples per iteration)
        beqz        a7, .fft2r_l3_skeep

        esp.lp.setup 0, a7, .fft2r_l3           // main butterfly loop
        esp.vld.128.ip q0, x25, 0               // Load data[m .. m + 3]
        esp.vld.128.ip q2, x24, 0               // Load data[ia .. ia + 3]
        esp.cmul.s16 q1, q5, q0, 2
        esp.vmul.s16 q2, q2, q6                 // q2 = in_data_ia*0x7fff
        esp.cmul.s16 q1, q5, q0, 3
        esp.vsub.s16 q3, q2, q1                 // input[2 * m] = input[2 * ia] - re_temp;

        esp.vadd.s16.st.incp q3, x25, q4, q2, q1 // store q3 to "m"; q4 = q2 + q1
.fft2r_l3: esp.vst.128.ip q4, x24, 16           // store the sum to "ia" (last loop instruction)

.fft2r_l3_skeep:
        add         t4, t4, t6                  // ia += N2
        add         t1, t1, 1                   // j++
        BNE         t1, t0, .fft2r_l2

        slli        t0, t0, 1                   // ie = ie<<1
        srli        t6, t6, 1                   // t6 = N2 = N2>>1
        bgt         t6, t2, .fft2r_l1           // N2 > 2

        // Stage with N2 == 2: ie / 2 iterations, two twiddles and 32 bytes
        // of data per iteration.
        srli        t0, t0, 1                   // ie = ie>>1
        mv          x26, a2                     // x26 - pointer to w
        mv          x24, a0
        esp.lp.setup 0, t0, .fft2r_l2_1
        esp.vldbc.32.ip q2, x26, 4
        esp.vldbc.32.ip q7, x26, 4
        esp.vunzip.32 q2, q7                    // combine the two broadcast twiddles

        esp.vld.l.64.ip q0, x24, 8              // gather: q0 takes 64-bit chunks 0 and 2,
        esp.vld.l.64.ip q1, x24, 8              //         q1 takes chunks 1 and 3
        esp.vld.h.64.ip q0, x24, 8
        esp.vld.h.64.ip q1, x24, -24            // rewind x24 to the start of the 32 bytes

        esp.vmul.s16 q0, q0, q6
        esp.cmul.s16 q3, q2, q1, 2
        esp.cmul.s16 q3, q2, q1, 3
        esp.vsub.s16 q4, q0, q3                 // differences
        esp.vadd.s16 q5, q0, q3                 // sums

        esp.vst.l.64.ip q5, x24, 8              // scatter back in the order loaded
        esp.vst.l.64.ip q4, x24, 8
        esp.vst.h.64.ip q5, x24, 8
.fft2r_l2_1: esp.vst.h.64.ip q4, x24, 8

        // Stage with N2 == 1: same iteration count, four twiddles and 32
        // bytes of data per iteration.
        mv          x26, a2                     // x26 - pointer to w
        mv          x24, a0
        esp.lp.setup 0, t0, .fft2r_l2_0
        esp.vld.128.ip q0, x24, 16              // first 16 bytes
        esp.vld.128.ip q1, x24, -16             // next 16 bytes; x24 back at the start
        esp.vld.128.ip q2, x26, 16

        esp.vunzip.32 q0, q1                    // q0 = ia samples, q1 = m samples

        esp.cmul.s16 q3, q2, q1, 2
        esp.vmul.s16 q0, q0, q6
        esp.cmul.s16 q3, q2, q1, 3

        esp.vsub.s16 q1, q0, q3
        esp.vadd.s16 q0, q0, q3

        esp.vzip.32 q0, q1                      // re-interleave sums and differences

        esp.vst.128.ip q0, x24, 16
.fft2r_l2_0: esp.vst.128.ip q1, x24, 16

        lw          s8, 4(sp)
        lw          s9, 8(sp)
        lw          s10, 12(sp)
        add         sp, sp, 16
        li          a0, 0                       // return 0
        ret
|
||||
|
||||
#endif // dsps_fft2r_sc16_arp4_enabled
|
||||
Reference in New Issue
Block a user