add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,153 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is matrix multipliction function for esp32p4 processor.
.text
.align 4
.global dl_fft2r_fc32_arp4_
.type dl_fft2r_fc32_arp4_,@function
dl_fft2r_fc32_arp4_:
//esp_err_t dl_fft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
add sp,sp,-16
#
srli t6, a1, 1 // a6 = N2 = N/2
li t0, 1 // a7 - ie
.fft2r_l1:
li t1, 0 // a8 - j
li t4, 0 // a11 = ia = 0;
.fft2r_l2: // loop for j, a8 - j
slli t3, t1, 3 // a10 = j<<3 // shift for cos () -- c = w[2 * j];
add t3, t3, a2 // a10 - pointer to cos
flw fa0, 0(t3)
flw fa1, 4(t3)
esp.lp.setup 0, t6, .fft2r_l3 // .fft2r_l3 - label to the last executed instruction
add t5, t4, t6 // a12 = m = ia + N2
slli a4, t5, 3 // a14 - pointer for m*2
slli a3, t4, 3 // a13 - pointer for ia*2
add a4, a4, a0 // pointers to data arrays
add a3, a3, a0 //
flw fa4, 0(a4)
flw fa5, 4(a4)
flw fa2, 0(a3)
flw fa3, 4(a3)
fmul.s ft6, fa0, fa4 // re_temp = c * data[2 * m]
fmul.s ft7, fa0, fa5 // im_temp = c * data[2 * m + 1]
fmadd.s ft6, fa1, fa5, ft6 // re_temp += s * data[2 * m + 1];
fnmsub.s ft7, fa1, fa4, ft7 // im_temp -= s * data[2 * m];
fsub.s ft8, fa2, ft6 // = data[2 * ia] - re_temp;
fsub.s ft9, fa3, ft7 // = data[2 * ia + 1] - im_temp;
fadd.s ft10, fa2, ft6 // = data[2 * ia] + re_temp;
fadd.s ft11, fa3, ft7 // = data[2 * ia + 1] + im_temp;
fsw ft8, 0(a4)
fsw ft9, 4(a4)
fsw ft10, 0(a3)
fsw ft11, 4(a3)
.fft2r_l3: add t4, t4, 1 // ia++
add t4, t4, t6
add t1, t1, 1 // j++
BNE t1, t0, .fft2r_l2
slli t0, t0, 1 // ie = ie<<1
srli t6, t6, 1 // a6 = a6>>1
BNEZ t6, .fft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0
ret
// This is matrix multipliction function for esp32p4 processor.
.text
.align 4
.global dl_ifft2r_fc32_arp4_
.type dl_ifft2r_fc32_arp4_,@function
dl_ifft2r_fc32_arp4_:
//esp_err_t dl_ifft2r_fc32_arp4_(float *data, int N, float* dl_fft_w_table_fc32)
add sp,sp,-16
#
srli t6, a1, 1 // a6 = N2 = N/2
li t0, 1 // a7 - ie
.ifft2r_l1:
li t1, 0 // a8 - j
li t4, 0 // a11 = ia = 0;
.ifft2r_l2: // loop for j, a8 - j
slli t3, t1, 3 // a10 = j<<3 // shift for cos () -- c = w[2 * j];
add t3, t3, a2 // a10 - pointer to cos
flw fa0, 0(t3)
flw fa1, 4(t3)
// CHANGE: Negate the imaginary part of twiddle factors (complex conjugate)
fneg.s fa1, fa1 // s = -s (since w^-1 = w*)
esp.lp.setup 0, t6, .ifft2r_l3 // .fft2r_l3 - label to the last executed instruction
add t5, t4, t6 // a12 = m = ia + N2
slli a4, t5, 3 // a14 - pointer for m*2
slli a3, t4, 3 // a13 - pointer for ia*2
add a4, a4, a0 // pointers to data arrays
add a3, a3, a0 //
flw fa4, 0(a4)
flw fa5, 4(a4)
flw fa2, 0(a3)
flw fa3, 4(a3)
fmul.s ft6, fa0, fa4 // re_temp = c * data[2 * m]
fmul.s ft7, fa0, fa5 // im_temp = c * data[2 * m + 1]
fmadd.s ft6, fa1, fa5, ft6 // re_temp += s * data[2 * m + 1];
fnmsub.s ft7, fa1, fa4, ft7 // im_temp -= s * data[2 * m];
fsub.s ft8, fa2, ft6 // = data[2 * ia] - re_temp;
fsub.s ft9, fa3, ft7 // = data[2 * ia + 1] - im_temp;
fadd.s ft10, fa2, ft6 // = data[2 * ia] + re_temp;
fadd.s ft11, fa3, ft7 // = data[2 * ia + 1] + im_temp;
fsw ft8, 0(a4)
fsw ft9, 4(a4)
fsw ft10, 0(a3)
fsw ft11, 4(a3)
.ifft2r_l3: add t4, t4, 1 // ia++
add t4, t4, t6
add t1, t1, 1 // j++
BNE t1, t0, .ifft2r_l2
slli t0, t0, 1 // ie = ie<<1
srli t6, t6, 1 // a6 = a6>>1
BNEZ t6, .ifft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0
ret

View File

@@ -0,0 +1,304 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text
.align 4
.global dl_fft4r_fc32_arp4_
.type dl_fft4r_fc32_arp4_,@function
dl_fft4r_fc32_arp4_:
//esp_err_t dl_fft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// table_size - a3
// m - t0
// j - t1
add sp,sp,-16
#
srli t6, a1, 1 // t6 = log4N = N/2
li t0, 2 // t0 - m
div a3, a3, a1 // wind_step = table_size / N
slli a3, a3, 3 // wind_step = complex step = 8 bytes
.fft2r_l1:
li t1, 0 // t1 - j
srli a1, a1, 2 // a1 = length = length >> 2;
.fft2r_l2: // loop for j, t1 - j
slli t2, a1, 4 // t2 = length << 1 << 3 (8 bytes for one complex sample)
slli t3, a1, 3 // t2 = length << 1 << 3 (8 bytes for one complex sample)
// start_index = j * (length << 1); // n: n-point FFT
mul t2,t2,t1
add a4, a0, t2 // fc32_t *ptrc0
add a5, a4, t3 // fc32_t *ptrc1
add a6, a5, t3 // fc32_t *ptrc2
add a7, a6, t3 // fc32_t *ptrc3
# flw fa0, 0(a4)
# fsw fa0, 0(t3)
# add t3, t3, 4
mv t2, a2 // winc0
mv t3, a2 // winc0
mv t4, a2 // winc0
esp.lp.setup 0, a1, .fft2r_l3 // .fft2r_l3 - label to the last executed instruction
flw fa0, 0(a4) // in0.re
flw fa4, 0(a6) // in2.re
fadd.s ft0, fa0, fa4 // in0.re + in2.re
flw fa1, 4(a4) // in0.im
fsub.s ft1, fa0, fa4 // in0.re - in2.re
flw fa5, 4(a6) // in2.im
fadd.s ft2, fa1, fa5 // in0.im + in2.im
flw fa2, 0(a5) // in1.re
fsub.s ft3, fa1, fa5 // in0.im - in2.im
flw fa6, 0(a7) // in3.re
fadd.s ft4, fa2, fa6 // in1.re + in3.re
flw fa3, 4(a5) // in1.im
fsub.s ft5, fa2, fa6 // in1.re - in3.re
flw fa7, 4(a7) // in3.im
fadd.s ft6, fa3, fa7 // in1.im + in3.im
fsub.s ft7, fa3, fa7 // in1.im - in3.im
# bfly[0].re = ft0 + ft4;
fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 + ft7;
fadd.s fa2, ft1, ft7;
# bfly[1].im = ft3 - ft5;
fsub.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft5;
fsub.s fa4, ft0, ft4;
flw ft0, 0(t2) // winc0->re
# bfly[2].im = ft2 - ft7;
fsub.s fa5, ft2, ft6;
flw ft2, 0(t3) // winc1->re
# bfly[3].re = ft1 - ft6;
fsub.s fa6, ft1, ft7;
flw ft1, 4(t2) // winc0->im
# bfly[3].im = ft3 + ft5;
fadd.s fa7, ft3, ft5;
// *ptrc0 = bfly[0];
fsw fa0, 0(a4) // in0.re
fsw fa1, 4(a4) // in0.im
flw ft3, 4(t3) // winc1->im
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
fmul.s fa0, fa2, ft0
add t2, t2, a3 // winc0 += 1 * wind_step;
fmul.s fa1, fa3, ft0
fmul.s ft0, fa4, ft2
fmul.s ft2, fa5, ft2
flw ft4, 0(t4) // winc2->re
flw ft5, 4(t4) // winc3->im
fmadd.s fa0, fa3, ft1, fa0
add t3, t3, a3 // winc1 += 2 * wind_step;
fnmsub.s fa1, fa2, ft1, fa1
add t3, t3, a3 //
fmul.s fa2, fa6, ft4
fmul.s fa3, fa7, ft4
add t4, t4, a3 // winc2 += 3 * wind_step;
fmadd.s ft0, fa5, ft3, ft0
add t4, t4, a3 //
fnmsub.s ft2, fa4, ft3, ft2
fmadd.s ft3, fa7, ft5, fa2
add t4, t4, a3 //
fnmsub.s fa3, fa6, ft5, fa3
fsw fa0, 0(a5) // in1.re
add a4, a4, 8
fsw fa1, 4(a5) // in1.im
add a5, a5, 8
fsw ft0, 0(a6) // in2.re
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
fsw ft2, 4(a6) // in2.re
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
add a6, a6, 8
fsw ft3, 0(a7) // in2.re
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
fsw fa3, 4(a7) // in2.re
add a7, a7, 8
// Temp solution
.fft2r_l3: nop
add t1, t1, 2 // j+=2
BNE t1, t0, .fft2r_l2
slli t0, t0, 2 // t0 = m = m<<2
srli t6, t6, 2 // t6 = log4N >>= 2
slli a3, a3, 2 // wind_step = wind_step << 2;
BNEZ t6, .fft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0
ret
.text
.align 4
.global dl_ifft4r_fc32_arp4_
.type dl_ifft4r_fc32_arp4_,@function
dl_ifft4r_fc32_arp4_:
//esp_err_t dl_ifft4r_fc32_arp4_(float *data, int N, float *table, int table_size)
// table_size - a3
// m - t0
// j - t1
add sp,sp,-16
#
srli t6, a1, 1 // t6 = log4N = N/2
li t0, 2 // t0 - m
div a3, a3, a1 // wind_step = table_size / N
slli a3, a3, 3 // wind_step = complex step = 8 bytes
.ifft2r_l1:
li t1, 0 // t1 - j
srli a1, a1, 2 // a1 = length = length >> 2;
.ifft2r_l2: // loop for j, t1 - j
slli t2, a1, 4 // t2 = length << 1 << 3 (8 bytes for one complex sample)
slli t3, a1, 3 // t2 = length << 1 << 3 (8 bytes for one complex sample)
// start_index = j * (length << 1); // n: n-point FFT
mul t2,t2,t1
add a4, a0, t2 // fc32_t *ptrc0
add a5, a4, t3 // fc32_t *ptrc1
add a6, a5, t3 // fc32_t *ptrc2
add a7, a6, t3 // fc32_t *ptrc3
# flw fa0, 0(a4)
# fsw fa0, 0(t3)
# add t3, t3, 4
mv t2, a2 // winc0
mv t3, a2 // winc0
mv t4, a2 // winc0
esp.lp.setup 0, a1, .ifft2r_l3 // .ifft2r_l3 - label to the last executed instruction
flw fa0, 0(a4) // in0.re
flw fa4, 0(a6) // in2.re
fadd.s ft0, fa0, fa4 // in0.re + in2.re
flw fa1, 4(a4) // in0.im
fsub.s ft1, fa0, fa4 // in0.re - in2.re
flw fa5, 4(a6) // in2.im
fadd.s ft2, fa1, fa5 // in0.im + in2.im
flw fa2, 0(a5) // in1.re
fsub.s ft3, fa1, fa5 // in0.im - in2.im
flw fa6, 0(a7) // in3.re
fadd.s ft4, fa2, fa6 // in1.re + in3.re
flw fa3, 4(a5) // in1.im
fsub.s ft5, fa2, fa6 // in1.re - in3.re
flw fa7, 4(a7) // in3.im
fadd.s ft6, fa3, fa7 // in1.im + in3.im
fsub.s ft7, fa3, fa7 // in1.im - in3.im
# bfly[0].re = ft0 + ft4;
fadd.s fa0, ft0, ft4;
# bfly[0].im = ft2 + ft6;
fadd.s fa1, ft2, ft6;
# bfly[1].re = ft1 - ft7;
fsub.s fa2, ft1, ft7;
# bfly[1].im = ft3 + ft5;
fadd.s fa3, ft3, ft5;
# bfly[2].re = ft0 - ft5;
fsub.s fa4, ft0, ft4;
flw ft0, 0(t2) // winc0->re
# bfly[2].im = ft2 - ft7;
fsub.s fa5, ft2, ft6;
flw ft2, 0(t3) // winc1->re
# bfly[3].re = ft1 + ft6;
fadd.s fa6, ft1, ft7;
flw ft1, 4(t2) // winc0->im
# bfly[3].im = ft3 - ft5;
fsub.s fa7, ft3, ft5;
// *ptrc0 = bfly[0];
fsw fa0, 0(a4) // in0.re
fsw fa1, 4(a4) // in0.im
flw ft3, 4(t3) // winc1->im
// ptrc1->re = bfly[1].re * winc0->re + bfly[1].im * winc0->im;
// ptrc1->im = bfly[1].im * winc0->re - bfly[1].re * winc0->im;
// ptrc2->re = bfly[2].re * winc1->re + bfly[2].im * winc1->im;
fmul.s fa0, fa2, ft0
add t2, t2, a3 // winc0 += 1 * wind_step;
fmul.s fa1, fa3, ft0
fmul.s ft0, fa4, ft2
fmul.s ft2, fa5, ft2
flw ft4, 0(t4) // winc2->re
flw ft5, 4(t4) // winc3->im
fnmsub.s fa0, fa3, ft1, fa0
add t3, t3, a3 // winc1 += 2 * wind_step;
fmadd.s fa1, fa2, ft1, fa1
add t3, t3, a3 //
fmul.s fa2, fa6, ft4
fmul.s fa3, fa7, ft4
add t4, t4, a3 // winc2 += 3 * wind_step;
fnmsub.s ft0, fa5, ft3, ft0
add t4, t4, a3 //
fmadd.s ft2, fa4, ft3, ft2
fnmsub.s ft3, fa7, ft5, fa2
add t4, t4, a3 //
fmadd.s fa3, fa6, ft5, fa3
fsw fa0, 0(a5) // in1.re
add a4, a4, 8
fsw fa1, 4(a5) // in1.im
add a5, a5, 8
fsw ft0, 0(a6) // in2.re
// ptrc2->im = bfly[2].im * winc1->re - bfly[2].re * winc1->im;
fsw ft2, 4(a6) // in2.re
// ptrc3->re = bfly[3].re * winc2->re + bfly[3].im * winc2->im;
add a6, a6, 8
fsw ft3, 0(a7) // in2.re
// ptrc3->im = bfly[3].im * winc2->re - bfly[3].re * winc2->im;
fsw fa3, 4(a7) // in2.re
add a7, a7, 8
// Temp solution
.ifft2r_l3: nop
add t1, t1, 2 // j+=2
BNE t1, t0, .ifft2r_l2
slli t0, t0, 2 // t0 = m = m<<2
srli t6, t6, 2 // t6 = log4N >>= 2
slli a3, a3, 2 // wind_step = wind_step << 2;
BNEZ t6, .ifft2r_l1// Jump if > 0
#
add sp,sp,16
li a0,0
ret