add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,94 @@
/*
* SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
.macro fir_s16_ae32_mul x1, x2, count, ID
// Fixed point dot product kernel for the MAC16 unit.
// It accumulates ((count + 1)*4) int16 products into acchi || acclo:
// one multiply in the pre-load stage, four per loop iteration and three in the finalize stage.
// x1 - input array1 register (samples), advanced by every ldinc
// x2 - input array2 register (coefficients), the array is inverted and is walked backwards by every lddec
// count - counter register (for example a7), it must hold (samples_count / 4) - 1
// ID - suffix appended to the local labels, it has to be unique for every expansion of the macro
// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
// acchi, and acclo have to be initialized before
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3
// the x1 and x2 address registers (moved by the ldinc / lddec loads)
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
// NOTE(review): the callers bias x1 by -4 and x2 by +4 before expanding this macro, which
// suggests that ldinc / lddec adjust the address before the load - confirm in the Xtensa ISA manual.
/*
* Data schedule. Each line represents instruction and columns represent
* register contents. Last column (MUL) shows the multiplication which
* takes place. Values loaded in the given cycle are shown in square brackets.
*
* m0 m1 m2 m3 MUL
* ----------------- pre-load --------------------------
*[x0 x1] (no MULs in the first 3 instructions)
* x0 x1 [y(N-1) y(N-2)]
* x0 x1 [x2 x3] y(N-1) y(N-2)
* x0 x1 x2 x3 y(N-1) y(N-2) [y(N-3) y(N-4)] x0*y(N-1)
* -------------------- loop ------------------------ (the following 4 instructions are
*[x4 x5] x2 x3 y(N-1) y(N-2) y(N-3) y(N-4) x1*y(N-2) repeated as much as needed)
* x4 x5 x2 x3 [y(N-5) y(N-6)] y(N-3) y(N-4) x2*y(N-3)
* x4 x5 [x6 x7] y(N-5) y(N-6) y(N-3) y(N-4) x3*y(N-4)
* x4 x5 x6 x7 y(N-5) y(N-6) [y(N-7) y(N-8)] x4*y(N-5)
* ------------------- finalize ----------------------
* x4 x5 x6 x7 y(N-5) y(N-6) y(N-7) y(N-8) x5*y(N-6) (nothing is loaded)
* x4 x5 x6 x7 y(N-5) y(N-6) y(N-7) y(N-8) x6*y(N-7)
* x4 x5 x6 x7 y(N-5) y(N-6) y(N-7) y(N-8) x7*y(N-8)
*/
// pre-load: fill m0, m2, m1 and issue the first multiply while m3 is being loaded
ldinc m0, \x1
lddec m2, \x2
ldinc m1, \x1
mula.dd.lh.lddec m3, \x2, m0, m2
// main loop: executed count times, it is skipped completely when count is zero
loopnez \count, .loop_end_\ID
.loop_\ID:
mula.dd.hl.ldinc m0, \x1, m0, m2
mula.dd.lh.lddec m2, \x2, m1, m3
mula.dd.hl.ldinc m1, \x1, m1, m3
mula.dd.lh.lddec m3, \x2, m0, m2
.loop_end_\ID:
// finalize: three multiplies on the data already held in m0..m3, no further loads
mula.dd.hl m0, m2
mula.dd.lh m1, m3
mula.dd.hl m1, m3
.endm // fir_s16_ae32_mul
.macro fir_s16_ae32_full x1, x2, count, full_count, ID
// Fixed point dot product for an arbitrary number of int16 samples.
// The bulk is processed in groups of four by fir_s16_ae32_mul (((count + 1)*4) samples),
// the remaining (full_count % 4) samples are processed by the two tails below.
// x1 - input array1 register (for example a2)
// x2 - input array2 register (for example a3)
// count - counter register (for example a7)
// count - samples_count / 4 - 1
// full_count - samples_count
// ID - suffix appended to the local labels, it has to be unique for every expansion of the macro
// acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2] + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4]; i: 0..count
// acchi, and acclo have to be initialized before
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3
// the x1 and x2 address registers (moved by the ldinc / lddec loads)
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
// the main mac16 multiplication loop is skipped for cases with less than 4 samples
blti \full_count, 4, .less_than_4_operands_\ID
fir_s16_ae32_mul \x1, \x2, \count, \ID
.less_than_4_operands_\ID:
// tail of two samples: taken when bit 1 of full_count is set, both halves of m0 and m2 are used
bbci \full_count, 1, .mod2chk_\ID
ldinc m0, \x1
lddec m2, \x2
mula.dd.hl m0, m2
mula.dd.lh m0, m2
.mod2chk_\ID:
// tail of one sample: taken when bit 0 of full_count is set
// a whole 32-bit word is still loaded from each array, but only one product is accumulated
bbci \full_count, 0, .mod1chk_\ID
ldinc m0, \x1
lddec m2, \x2
mula.dd.lh m0, m2
.mod1chk_\ID:
.endm // fir_s16_ae32_full

View File

@@ -0,0 +1,157 @@
/*
* SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_fir.h"
#include "malloc.h"
#include <string.h>
#include "dsp_tests.h"
#define ROUNDING_VALUE 0x7fff
/**
 * Initialise a decimating 16-bit fixed point FIR filter structure.
 *
 * The structure fields are filled from the arguments, the arguments are
 * validated and, when CONFIG_DSP_OPTIMIZED is set, the helper buffers used by
 * the optimised implementations are allocated. Everything allocated here is
 * recorded in fir->free_status:
 *   0x0001 - fir->delay was allocated by this function
 *   0x0002 - fir->coeffs and fir->delay were both re-allocated (padded to a multiple of 8)
 *   0x0004 - fir->rounding_buff was allocated by this function
 * dsps_fird_s16_aexx_free() releases whatever is recorded there. It should be
 * called when the filter is no longer needed, and also after a failed
 * initialisation, because an error detected after the first allocation leaves
 * the already allocated buffers attached to the structure.
 *
 * @param fir         filter structure to initialise
 * @param coeffs      filter coefficients
 * @param delay       delay line buffer; may be NULL only on the paths that allocate their own
 * @param coeffs_len  number of coefficients, must be at least 2
 * @param decim       decimation factor
 * @param start_pos   initial decimation position, must be lower than decim
 * @param shift       output shift, must be within -40..40
 *
 * @return ESP_OK on success
 *         ESP_ERR_DSP_INVALID_LENGTH    coeffs_len is lower than 2
 *         ESP_ERR_DSP_PARAM_OUTOFRANGE  shift or start_pos is out of range, or no delay line is available
 *         ESP_ERR_DSP_ARRAY_NOT_ALIGNED delay or coeffs is not 16-byte aligned (aes3 path only)
 *         ESP_ERR_NO_MEM                a helper buffer could not be allocated
 */
esp_err_t dsps_fird_init_s16(fir_s16_t *fir, int16_t *coeffs, int16_t *delay, int16_t coeffs_len, int16_t decim, int16_t start_pos, int16_t shift)
{
    fir->coeffs = coeffs;
    fir->delay = delay;
    fir->coeffs_len = coeffs_len;
    fir->pos = 0;
    fir->decim = decim;
    fir->d_pos = start_pos;
    fir->shift = shift;
    fir->rounding_val = (int16_t)(ROUNDING_VALUE);
    fir->free_status = 0;

    if (fir->coeffs_len < 2) {                          // number of coefficients must be higher than 1
        return ESP_ERR_DSP_INVALID_LENGTH;
    }

    if ((fir->shift > 40) || (fir->shift < -40)) {      // shift amount must be within a range from -40 to 40
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    if (fir->d_pos >= fir->decim) {                     // start position must be lower than decimation
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

#if CONFIG_DSP_OPTIMIZED

    // Rounding value buffer primary for a purpose of ee.ld.accx.ip, but used for both the esp32 and esp32s3
    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
    int32_t *aexx_rounding_buff = (int32_t *)memalign(16, 2 * sizeof(int32_t));
    if (aexx_rounding_buff == NULL) {
        return ESP_ERR_NO_MEM;
    }

    long long rounding = (long long)(fir->rounding_val);
    if (fir->shift >= 0) {
        rounding = (rounding >> fir->shift);
    } else {
        rounding = (rounding << (-fir->shift));
    }

    aexx_rounding_buff[0] = (int32_t)(rounding);                    // 32 lower bits (acclo) type reassignment to 32-bit
    aexx_rounding_buff[1] = (int32_t)((rounding >> 32) & 0xFF);     // 8 higher bits (acchi) shift by 32 and apply the mask
    // Attach the buffer before any further allocation, so that it stays tracked on every later error return
    fir->rounding_buff = aexx_rounding_buff;
    fir->free_status |= 0x0004;

#if dsps_fird_s16_arp4_enabled
    fir->pos = start_pos;

    // The delay line is always replaced by an aligned buffer with 8 guard samples on each side
    const int padded_delay_len = coeffs_len + 8 * 2;
    int16_t *new_delay_buff = (int16_t *)memalign(16, padded_delay_len * sizeof(int16_t));
    if (new_delay_buff == NULL) {
        return ESP_ERR_NO_MEM;
    }
    for (int i = 0 ; i < padded_delay_len ; i++) {
        new_delay_buff[i] = 0;
    }
    fir->delay = &new_delay_buff[8];
    fir->free_status |= 0x0001;
#endif // dsps_fird_s16_arp4_enabled

#if dsps_fird_s16_aes3_enabled

    if (fir->delay == NULL) {                           // New delay buffer is allocated if the current delay line is NULL
        int16_t *new_delay_buff = (int16_t *)memalign(16, coeffs_len * sizeof(int16_t));
        if (new_delay_buff == NULL) {
            return ESP_ERR_NO_MEM;
        }
        fir->delay = new_delay_buff;
        fir->free_status |= 0x0001;
    } else {
        if ((int)fir->delay & 0xf) {                    // Delay line array must be aligned
            return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
        }
    }

    if ((int)fir->coeffs & 0xf) {                       // Coefficients array must be aligned
        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
    }

    // If the number of coefficients is not divisible by 8, a new delay line and a new coefficients array are allocated
    // the newly allocated arrays are divisible by 8. Coefficients are copied from the original fir structure to
    // the new coeffs array and the remaining space is filled with zeroes
    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
    if (fir->coeffs_len % 8) {                          // Number of coefficients must be divisible by 8
        int16_t zero_coeffs = (8 - (fir->coeffs_len % 8));
        int16_t new_coeffs_len = fir->coeffs_len + zero_coeffs;
        int16_t *aes3_delay_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
        int16_t *aes3_coeffs_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));

        if ((aes3_delay_buff == NULL) || (aes3_coeffs_buff == NULL)) {
            // free(NULL) is a no-op, so whichever allocation succeeded is released here
            free(aes3_delay_buff);
            free(aes3_coeffs_buff);
            return ESP_ERR_NO_MEM;
        }

        for (int i = 0; i < fir->coeffs_len; i++) {     // copy fir->coeffs to aes3_coeffs_buff
            aes3_coeffs_buff[i] = fir->coeffs[i];
        }

        for (int i = fir->coeffs_len; i < new_coeffs_len; i++) {    // add zeroes to the end
            aes3_coeffs_buff[i] = 0;
        }

        // A delay line allocated a few lines above (NULL delay argument) is about to be replaced:
        // release it first, otherwise the only pointer to it is lost.
        // NOTE(review): assumes the arp4 and aes3 paths are never enabled together, so fir->delay
        // is the start of the allocation here - confirm in dsps_fir_platform.h
        if (fir->free_status & 0x0001) {
            free(fir->delay);
        }

        fir->delay = aes3_delay_buff;
        fir->coeffs = aes3_coeffs_buff;
        fir->coeffs_len = new_coeffs_len;
        fir->free_status |= 0x0002;
    }

#endif // dsps_fird_s16_aes3_enabled
#endif // CONFIG_DSP_OPTIMIZED

    // Paths that do not allocate a delay line need one from the caller
    if (fir->delay == NULL) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    for (int i = 0; i < fir->coeffs_len; i++) {         // Initialize the delay line to zero
        fir->delay[i] = 0;
    }

    return ESP_OK;
}
/**
 * Release the buffers recorded in fir->free_status.
 *
 * free_status bits as used below:
 *   0x0002 - fir->coeffs is released
 *   0x0001 or 0x0002 - fir->delay is released
 *   0x0004 - fir->rounding_buff is released
 * free_status is cleared afterwards, so calling the function again is a no-op.
 *
 * @param fir  filter structure whose recorded buffers are released
 * @return ESP_OK always
 */
esp_err_t dsps_fird_s16_aexx_free(fir_s16_t *fir)
{
    if (fir->free_status != 0) {

        if (fir->free_status & 0x0002) {
            free(fir->coeffs);
        }

        if (fir->free_status & 0x0003) {
#if dsps_fird_s16_arp4_enabled
            // fir->delay points 8 samples into the allocation on this path, step back to its start
            fir->delay = &fir->delay[-8];
#endif
            free(fir->delay);
        }

        if (fir->free_status & 0x0004) {
            free(fir->rounding_buff);
        }

        fir->free_status = 0;
    }

    return ESP_OK;
}
/**
 * Reverse an array of int16_t values in place.
 *
 * @param arr  array to reverse
 * @param len  number of elements; nothing is swapped when len is lower than 2
 * @return ESP_OK always
 */
esp_err_t dsps_16_array_rev(int16_t *arr, int16_t len)
{
    int head = 0;
    int tail = len - 1;

    // Walk from both ends towards the middle, swapping the outer pair each step
    while (head < tail) {
        const int16_t saved = arr[head];
        arr[head] = arr[tail];
        arr[tail] = saved;
        head++;
        tail--;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,181 @@
/*
* SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_fir_platform.h"
#if (dsps_fird_s16_ae32_enabled == 1)
#include "dsps_fir_s16_m_ae32.S"
// This is FIR filter for ESP32 processor.
.text
.align 4
.global dsps_fird_s16_ae32
.type dsps_fird_s16_ae32,@function
// The function implements the following C code:
//int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
//
// Decimating int16 FIR filter for the MAC16 unit. For every output sample the delay line is
// refilled from the input, the 40-bit accumulator is preloaded with the rounding value, two
// dot products are accumulated (delay[pos..N-1] and delay[0..pos-1]) and the accumulator is
// shifted and stored as one int16 output sample. The return value is the len argument.
//
// Structure offsets used below: 0 coeffs, 4 delay, 8 N, 10 pos, 12 decim, 14 d_pos,
// 16 shift, 20 pointer to the rounding buffer.
// Stack frame: [a1 + 0] pointer to the last coefficient pair, [a1 + 4] len (return value).
//
// NOTE(review): a7 (pos) is never stored back to the structure in this routine and a11 starts
// at the beginning of the delay line regardless of pos - confirm that callers rely on that.
dsps_fird_s16_ae32:
// Input params Variables
//
// fir - a2 N - a6
// input - a3 pos - a7
// output - a4 rounding_lo - a8
// len - a5 d_pos - a9
// &coeffs[N] - a10
// delay - a11
// decim - a12
// rounding_hi - a13
// final_shift - a14 (shift)
entry a1, 32
l16si a7, a2, 10 // a7 - pos
l16si a6, a2, 8 // a6 - N
l32i a10, a2, 0 // a10 - coeffs
addx2 a10, a6, a10 // a10 = coeffs + 2*N = &coeffs[N], one element past the last coefficient
addi a10, a10, -4 // a10 = &coeffs[N - 2], the last 32-bit pair of coefficients
s32i a10, a1, 0 // save the pointer at [a1 + 0], it is reloaded after every output sample
l32i a11, a2, 4 // a11 - delay line
l16si a12, a2, 12 // a12 - decimation
l16si a9, a2, 14 // a9 - d_pos
l16si a14, a2, 16 // a14 - shift
// prepare rounding value
l32i a15, a2, 20 // get address of rounding array to a15
l32i a8, a15, 0 // a8 = lower 32 bits of the rounding value (acclo)
l32i a13, a15, 4 // a13 = higher 8 bits of the rounding value (acchi), offset 4 (32 bits)
// prepare final_shift value
addi a14, a14, -15 // shift - 15
abs a15, a14
blti a15, 32, _shift_lower_than_32_init // check if lower than 32
// greater than 32 could only be negative shift ((-40 to +40) - 15) -> -55 to +25
addi a14, a14, 32 // if greater than 32, add 32 (SRC is not defined for SAR greater than 32)
_shift_lower_than_32_init:
bltz a14, _shift_negative_init // branch if lower than zero (not including zero)
beqz a14, _shift_negative_init // branch if equal to zero (add zero to the previous statement)
ssl a14 // if positive, set SAR register to left shift value (SAR = 32 - shift)
j _end_of_shift_init
_shift_negative_init: // negative shift
abs a14, a14 // absolute value
ssr a14 // SAR = -shift
// final_shift is saved to SAR register, SAR is not being changed during the execution
_end_of_shift_init:
l16si a14, a2, 16 // a14 - load shift value again, a14 was modified while SAR was being set
addi a14, a14, -15 // shift - 15
s32i a5, a1, 4 // save len at [a1 + 4], used as the return value
// first delay line load (decim - d_pos) when d_pos is not 0
// this is evaluated only once, a9 is reused as full_count from the first output sample on
beqz a9, _fird_loop_len
sub a15, a12, a9 // a15 = decim - d_pos
loopnez a15, ._loop_d_pos
blt a7, a6, reset_fir_pos_d_pos // skip the wrap-around while fir->pos < fir->N
movi.n a7, 0 // fir->pos = 0
l32i a11, a2, 4 // reset delay line to the beginning
reset_fir_pos_d_pos:
l16si a15, a3, 0 // load 16 bits from input (a3) to a15
addi a7, a7, 1 // fir->pos++
s16i a15, a11, 0 // save 16 bits from a15 to delay line (a11)
addi a3, a3, 2 // increment input pointer
addi a11, a11, 2 // increment delay line pointer
._loop_d_pos:
j .fill_delay_line // skip the first iteration of the delay line filling routine
// outer loop, one iteration per output sample
_fird_loop_len:
// copy decim input samples to the delay line
loopnez a12, .fill_delay_line
blt a7, a6, reset_fir_pos // skip the wrap-around while fir->pos < fir->N
movi.n a7, 0 // fir->pos = 0
l32i a11, a2, 4 // reset delay line to the beginning
reset_fir_pos:
l16si a15, a3, 0 // load 16 bits from input (a3) to a15
addi a7, a7, 1 // fir->pos++
s16i a15, a11, 0 // save 16 bits from a15 to delay line (a11)
addi a3, a3, 2 // increment input pointer
addi a11, a11, 2 // increment delay line pointer
.fill_delay_line:
// prepare MAC unit
wsr a8, acclo // acclo = a8
wsr a13, acchi // acchi = a13
// first dot product: delay[pos .. N-1] with the coefficients taken from coeffs[N-1] downwards
addi a11, a11, -4 // preset delay line pointer, samples (array is being incremented)
sub a9, a6, a7 // a9 = full_count = fir->N - fir->pos
// (Count / 4) - 1
srli a15, a9, 2 // a15 = count = full_count /4
addi a10, a10, 4 // preset coeffs pointer, samples (array is being decremented)
addi a15, a15, -1 // count - 1
// x1, x2, count, full_count, ID
fir_s16_ae32_full a11, a10, a15, a9, __LINE__
// second dot product: delay[0 .. pos-1] with the coefficients taken from coeffs[pos-1] downwards
l32i a10, a2, 0 // load coeffs
l32i a11, a2, 4 // reset delay line to the beginning
addx2 a10, a7, a10 // a10 = &coeffs[pos], lddec steps back before the first load
srli a15, a7, 2 // a15 = count = full_count (fir->pos) / 4
addi a11, a11, -4 // preset delay line pointer, samples (array is being incremented)
addi a15, a15, -1 // count - 1
// x1, x2, count, full_count, ID
fir_s16_ae32_full a11, a10, a15, a7, __LINE__
// SAR already set from the beginning to final_shift value
abs a15, a14 // absolute value of shift
l32i a10, a1, 0 // reset coefficient pointer
blti a15, 32, _shift_lower_than_32
// shift of 32 or more: only the upper accumulator bits contribute to the result
rsr a9, acchi // get only higher 8 bits of the acc register
movi.n a15, 0xFF // higher 8 bits mask
and a9, a9, a15 // apply mask
srl a15, a9
j _shift_set
_shift_lower_than_32:
rsr a9, acchi // get higher 8 bits of the acc register
movi.n a11, 0xFF // higher 8 bits mask (a11 is reloaded at _shift_set)
rsr a15, acclo // get lower 32 bits of the acc register
and a9, a9, a11 // apply mask
bltz a14, _shift_negative // branch if lower than zero (if negative)
beqz a14, _shift_negative
src a15, a15, a9 // funnel shift left
j _shift_set
_shift_negative: // negative shift
src a15, a9, a15 // funnel shift right
_shift_set:
l32i a11, a2, 4 // Load initial position of the delay line
s16i a15, a4, 0 // save the shifted value to the output array (a4)
addi a5, a5, -1 // len--
addi a4, a4, 2 // increase pointer of the output array
addx2 a11, a7, a11 // p_delay[fir->pos] - (two times the fir->pos)
// counter
bnez a5, _fird_loop_len // next output sample, falls through when a5 == 0
l32i.n a2, a1, 4 // load return value to a2
retw.n
#endif // dsps_fird_s16_ae32_enabled

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,54 @@
/*
* SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_fir.h"
/**
 * Reference (ANSI C) implementation of the decimating 16-bit fixed point FIR filter.
 *
 * For every output sample (fir->decim - fir->d_pos) input samples are written
 * to the circular delay line, then the delay line is multiplied with the
 * coefficients taken in reverse order. The accumulator starts from the
 * rounding value and the sum is shifted by (fir->shift - 15) before it is
 * stored as int16_t. fir->d_pos is cleared after the first output sample and
 * fir->pos is left at the next write position.
 *
 * @param fir     initialised filter structure
 * @param input   input samples; len * decim samples are consumed (fewer by d_pos for the first output)
 * @param output  output samples, len values are written
 * @param len     number of output samples to produce
 *
 * @return number of output samples written
 */
int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
{
    int32_t result = 0;
    int32_t input_pos = 0;
    long long rounding = 0;
    const int32_t final_shift = fir->shift - 15;

    // Scale the rounding constant so it matches the weight of the output LSB in the accumulator
    rounding = (long long)(fir->rounding_val);
    if (fir->shift >= 0) {
        rounding = (rounding >> fir->shift) & 0xFFFFFFFFFF;         // 40-bit mask
    } else {
        rounding = (rounding << (-fir->shift)) & 0xFFFFFFFFFF;      // 40-bit mask
    }

    // len is already a length of the *output array, calculated as (length of the input array / decimation)
    for (int i = 0; i < len; i++) {

        // Feed the delay line; d_pos shortens only the first block of samples
        for (int j = 0; j < fir->decim - fir->d_pos; j++) {
            if (fir->pos >= fir->coeffs_len) {
                fir->pos = 0;
            }
            fir->delay[fir->pos++] = input[input_pos++];
        }
        fir->d_pos = 0;

        long long acc = rounding;
        int16_t coeff_pos = fir->coeffs_len - 1;

        // Oldest samples first: delay[pos .. N-1], then delay[0 .. pos-1]
        for (int n = fir->pos; n < fir->coeffs_len ; n++) {
            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
        }
        for (int n = 0; n < fir->pos ; n++) {
            acc += (int32_t)fir->coeffs[coeff_pos--] * (int32_t)fir->delay[n];
        }

        if (final_shift > 0) {
            // Left-shifting a negative signed value is undefined behaviour in C, so the shift is
            // done on the unsigned representation; the low 16 bits kept by the cast are the same.
            output[result++] = (int16_t)((unsigned long long)acc << final_shift);
        } else {
            output[result++] = (int16_t)(acc >> (-final_shift));
        }
    }

    return result;
}

View File

@@ -0,0 +1,150 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_fir_platform.h"
#if (dsps_fird_s16_arp4_enabled == 1)
// This is FIR filter for esp32p4 processor.
.text
.align 4
.global dsps_fird_s16_arp4
.global dsps_fird_s16_ansi
.type dsps_fird_s16_arp4,@function
// The function implements the following C code:
// int32_t dsps_fird_s16_arp4(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
//
// Decimating int16 FIR filter for the esp32p4 vector extension.
//
// In:    a0 = fir, a1 = input, a2 = output, a3 = len
// Out:   a0 = number of output samples written (len, or 0 when len <= 0)
// Stack: 48 bytes, used to preserve s0..s11
// Clobbers: t1..t6, a1..a4, a6, q0..q2, the vector accumulator and the hardware loop registers.
//
// Structure offsets used below: 0 coeffs, 4 delay, 8 coeffs_len, 10 pos, 12 decim,
// 16 shift, 20 pointer to the rounding buffer. d_pos (offset 14) is not consulted here,
// every output sample consumes decim input samples.
//
// Filters whose length is not a multiple of 8 are handed over to dsps_fird_s16_ansi
// with a tail call, all four arguments are still untouched at that point.
dsps_fird_s16_arp4:
    // In case of filter length different then 8*K
    lh  t2,  8(a0)      // t2 - coeffs_len
    andi t2, t2, 7
    beqz t2, .dsps_fird_s16_arp4_body
    j   dsps_fird_s16_ansi
.dsps_fird_s16_arp4_body:
    add sp,sp,-48
    sw  s0,  0(sp)
    sw  s1,  4(sp)
    sw  s2,  8(sp)
    sw  s3,  12(sp)
    sw  s4,  16(sp)
    sw  s5,  20(sp)
    sw  s6,  24(sp)
    sw  s7,  28(sp)
    sw  s8,  32(sp)
    sw  s9,  36(sp)
    sw  s10, 40(sp)
    sw  s11, 44(sp)

    // Return value. The exit path returns a6, so it has to be set before anything can
    // reach .fast_exit. a6 carries no argument of this function and is caller-saved,
    // which makes it free to use. The output loop below is bottom-tested, so a
    // non-positive len has to leave here, otherwise one sample would still be processed.
    li   a6, 0          // nothing written yet
    blez a3, .fast_exit // len <= 0: write nothing and return 0, as the ANSI version does
    mv   a6, a3         // a6 - len, a3 itself is counted down in the loop

    // Enable unaligned data access
    // NOTE(review): the configuration register is not restored on exit - confirm this is intended
    esp.movx.r.cfg t6
    or  t6, t6, 2
    esp.movx.w.cfg t6

    lw  t1,  4(a0)      // t1 - delay_line
    lh  t2,  8(a0)      // t2 - coeffs_len
    lh  t3,  10(a0)     // t3 - pos
    lh  t6,  16(a0)     // t6 - shift
    add t6, t6, -15
    neg t6, t6          // t6 = 15 - shift, the amount used by the final accumulator shift
    lw  t5,  20(a0)     // t5 - rounding_buff

    lw  s2,  4(a0)      // s2 - delay_line* current position
    add s2, s2, t3      // s2 = delay_line + pos*2
    add s2, s2, t3      //

    add s4, t2, t2      // s4 = coeff_len*2
    add s0, t1, s4      // s0 - &delay[coeffs_len]
    lh  a4,  0(t1)      // the loaded value is not used afterwards
                        // NOTE(review): presumably a warm-up read of the delay line - confirm

.loop_len:
    lh  t4,  12(a0)     // t4 - decim
.loop_decim_copy:
    lh  s1,  0(a1)      // load input data
    add a1, a1, 2
    sh  s1,  0(s2)
    add s2, s2, 2       // preincrement of delay line
    bgt s0, s2, .skeep_reset
    lw  s2,  4(a0)      // s2 - delay_line, wrap around at the end of the buffer
.skeep_reset:
    add t4, t4, -1
    bgtz t4, .loop_decim_copy

    // s5 - count1 = length - pos
    // s6 = count1 >> 3 :
    sub  t3, s2, t1
    srli t3, t3, 1      // t3 = (pos*2)>>1
    sub  s5, t2, t3
    srli s6, s5, 3      // s6 = (coeff_len - pos)>>3
    srli s7, t3, 3      // s7 = pos>>3
    and  s8, t3, 0x07   // s8 = pos&0x07

    esp.ld.xacc.ip t5, 0            // load rounding value to accx
    lw  s10, 0(a0)                  // s10 - coeffs
    esp.vld.128.ip q0, s10, 16      //q0 - coeffs
    mv  s9, s2                      // s9 - pointer to delay line
    esp.vld.128.ip q1, s9, 16       // q1 - delay line data

    // part 1: whole groups of 8 samples from the current position to the end of the delay line
    beqz s6, .skip_main_loop1
    esp.lp.setup 0, s6, .main_loop1
    esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
.main_loop1: esp.vld.128.ip q1, s9, 16 // Load delay line
.skip_main_loop1: nop

    // step back to the matching position at the start of the delay line
    add s9, s9, -16
    sub s9, s9, s4

    // part 2: the group of 8 that straddles the wrap-around, taken only when pos is not a multiple of 8
    // NOTE(review): adding the two loads assumes the samples around the buffer are zero (guard area) - confirm
    beqz s8, .skip_rest_add
    esp.vld.128.ip q2, s9, 16
    esp.vadd.s16 q1, q2, q1
    esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
.skip_rest_add:
    esp.vld.128.ip q1, s9, 16

    // part 3: whole groups of 8 samples from the start of the delay line up to the current position
    beqz s7, .skip_main_loop3
    esp.lp.setup 1, s7, .main_loop3
    esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
    esp.vld.128.ip q1, s9, 16
.main_loop3: nop
.skip_main_loop3: nop

    // Shift and Store result
    esp.srs.s.xacc s11, t6          // shift accx register by the final shift amount (t6), result goes to s11
    sh  s11, 0(a2)                  // store the low 16 bits to the output buffer
    add a2, a2, 2
    add a3, a3, -1
    bgtz a3, .loop_len

    sh  t3,  10(a0)     // store the updated pos back to the structure

.fast_exit:
    mv  a0, a6          // return the number of produced output samples
    lw  s0,  0(sp)
    lw  s1,  4(sp)
    lw  s2,  8(sp)
    lw  s3,  12(sp)
    lw  s4,  16(sp)
    lw  s5,  20(sp)
    lw  s6,  24(sp)
    lw  s7,  28(sp)
    lw  s8,  32(sp)
    lw  s9,  36(sp)
    lw  s10, 40(sp)
    lw  s11, 44(sp)
    add sp,sp,48
    ret
#endif //