add some code
This commit is contained in:
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
.macro fir_s16_ae32_mul x1, x2, count, ID
    // Fixed-point dot product of ((count + 1) * 4) int16 samples on the MAC16 unit.
    //
    //   x1    - address register of the samples; advanced by the ldinc loads
    //   x2    - address register of the coefficients; the coefficient array is
    //           walked backwards by the lddec loads
    //   count - register holding (samples_count / 4) - 1 (for example a7)
    //   ID    - text appended to the local labels so that every expansion is unique
    //
    // acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2]
    //      + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4];   i: 0..count
    //
    // acchi || acclo must be loaded by the caller beforehand (for example 0x3fff
    // to acclo for rounding); the result is left in acchi || acclo.
    //
    // Modifies: m0, m1, m2, m3, acchi || acclo, and the registers passed as x1/x2.

    /*
     * Data schedule. Each row is one instruction, the columns show the register
     * contents. The last column (MUL) is the multiplication performed by that
     * instruction. Values loaded in the given cycle are shown in square brackets.
     *
     *    m0        m1        m2                 m3                 MUL
     * ------------------------------ pre-load -------------------------------
     *  [x0 x1]                                                     (no MULs in the first 3 instructions)
     *   x0 x1              [y(N-1) y(N-2)]
     *   x0 x1    [x2 x3]    y(N-1) y(N-2)
     *   x0 x1     x2 x3     y(N-1) y(N-2)     [y(N-3) y(N-4)]      x0*y(N-1)
     * -------------------------------- loop --------------------------------- (these 4 instructions are
     *  [x4 x5]    x2 x3     y(N-1) y(N-2)      y(N-3) y(N-4)       x1*y(N-2)   repeated as much as needed)
     *   x4 x5     x2 x3    [y(N-5) y(N-6)]     y(N-3) y(N-4)       x2*y(N-3)
     *   x4 x5    [x6 x7]    y(N-5) y(N-6)      y(N-3) y(N-4)       x3*y(N-4)
     *   x4 x5     x6 x7     y(N-5) y(N-6)     [y(N-7) y(N-8)]      x4*y(N-5)
     * ------------------------------ finalize -------------------------------
     *   x4 x5     x6 x7     y(N-5) y(N-6)      y(N-7) y(N-8)       x5*y(N-6)   (nothing is loaded)
     *   x4 x5     x6 x7     y(N-5) y(N-6)      y(N-7) y(N-8)       x6*y(N-7)
     *   x4 x5     x6 x7     y(N-5) y(N-6)      y(N-7) y(N-8)       x7*y(N-8)
     */

    // Pre-load: fill m0, m2 and m1 before the first multiplication
    ldinc               m0, \x1
    lddec               m2, \x2
    ldinc               m1, \x1

    // First product; m3 is loaded by the same instruction
    mula.dd.lh.lddec    m3, \x2, m0, m2

    // Steady state: four multiply-and-load instructions per pass.
    // loopnez skips the body entirely when \count is zero.
    loopnez             \count, .Lfir_mul_drain_\ID
.Lfir_mul_body_\ID:
    mula.dd.hl.ldinc    m0, \x1, m0, m2
    mula.dd.lh.lddec    m2, \x2, m1, m3
    mula.dd.hl.ldinc    m1, \x1, m1, m3
    mula.dd.lh.lddec    m3, \x2, m0, m2
.Lfir_mul_drain_\ID:

    // Finalize: the three remaining products need no further loads
    mula.dd.hl          m0, m2
    mula.dd.lh          m1, m3
    mula.dd.hl          m1, m3

.endm   // fir_s16_ae32_mul
|
||||
|
||||
.macro fir_s16_ae32_full x1, x2, count, full_count, ID
    // Fixed-point dot product of full_count int16 samples.
    // Blocks of four samples are handled by fir_s16_ae32_mul; the remaining
    // two and/or one samples are handled here, selected by bits 1 and 0 of
    // full_count.
    //
    //   x1         - input array1 register, samples (for example a2)
    //   x2         - input array2 register, coefficients (for example a3)
    //   count      - register holding samples_count / 4 - 1 (for example a7);
    //                only used when full_count >= 4
    //   full_count - register holding samples_count
    //   ID         - text appended to the local labels so that every expansion is unique
    //
    // acc += x1[i + 0]*x2[N - i - 1] + x1[i + 1]*x2[N - i - 2]
    //      + x1[i + 2]*x2[N - i - 3] + x1[i + 3]*x2[N - i - 4];   i: 0..count
    //
    // acchi || acclo must be loaded by the caller beforehand (for example 0x3fff
    // to acclo); the result is left in acchi || acclo.
    //
    // Modifies: m0, m1, m2, m3, acchi || acclo, and the registers passed as x1/x2.

    // The main mac16 loop is skipped when there are fewer than 4 samples
    blti        \full_count, 4, .Lfir_full_tail_\ID
    fir_s16_ae32_mul \x1, \x2, \count, \ID

.Lfir_full_tail_\ID:

    // Bit 1 of full_count set: two more samples, one load per array and two products
    bbci        \full_count, 1, .Lfir_full_pair_done_\ID
    ldinc       m0, \x1
    lddec       m2, \x2
    mula.dd.hl  m0, m2
    mula.dd.lh  m0, m2
.Lfir_full_pair_done_\ID:

    // Bit 0 of full_count set: one last sample, one load per array and a single product
    bbci        \full_count, 0, .Lfir_full_single_done_\ID
    ldinc       m0, \x1
    lddec       m2, \x2
    mula.dd.lh  m0, m2
.Lfir_full_single_done_\ID:

.endm   // fir_s16_ae32_full
|
||||
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsps_fir.h"
|
||||
#include "malloc.h"
|
||||
#include <string.h>
|
||||
#include "dsp_tests.h"
|
||||
|
||||
#define ROUNDING_VALUE 0x7fff
|
||||
|
||||
/*
 * Initialise an int16 decimating FIR filter structure.
 *
 * fir        - structure to fill in
 * coeffs     - filter coefficients
 * delay      - delay line with room for coeffs_len samples; may be replaced by an
 *              internally allocated buffer on the optimised targets (see below)
 * coeffs_len - number of coefficients, must be at least 2
 * decim      - decimation factor
 * start_pos  - initial decimation phase, must satisfy 0 <= start_pos < decim
 * shift      - output shift, must be within -40..40
 *
 * Returns ESP_OK on success, ESP_ERR_DSP_INVALID_LENGTH,
 * ESP_ERR_DSP_PARAM_OUTOFRANGE or ESP_ERR_DSP_ARRAY_NOT_ALIGNED for invalid
 * arguments, and ESP_ERR_NO_MEM when an internal buffer cannot be allocated.
 *
 * On success with CONFIG_DSP_OPTIMIZED the structure owns heap buffers
 * (tracked in fir->free_status) and dsps_fird_s16_aexx_free() must be called
 * once the filter is no longer used. On failure every buffer allocated by
 * this call has already been released.
 */
esp_err_t dsps_fird_init_s16(fir_s16_t *fir, int16_t *coeffs, int16_t *delay, int16_t coeffs_len, int16_t decim, int16_t start_pos, int16_t shift)
{
    fir->coeffs = coeffs;
    fir->delay = delay;
    fir->coeffs_len = coeffs_len;
    fir->pos = 0;
    fir->decim = decim;
    fir->d_pos = start_pos;
    fir->shift = shift;
    fir->rounding_val = (int16_t)(ROUNDING_VALUE);
    fir->free_status = 0;

    if (fir->coeffs_len < 2) {                                  // number of coefficients must be higher than 1
        return ESP_ERR_DSP_INVALID_LENGTH;
    }

    if ((fir->shift > 40) || (fir->shift < -40)) {              // shift amount must be within a range from -40 to 40
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    if ((fir->d_pos < 0) || (fir->d_pos >= fir->decim)) {       // start position must be within 0 .. decimation - 1
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

#if CONFIG_DSP_OPTIMIZED

    // Rounding value buffer primary for a purpose of ee.ld.accx.ip, but used for both the esp32 and esp32s3
    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
    int32_t *aexx_rounding_buff = (int32_t *)memalign(16, 2 * sizeof(int32_t));
    if (aexx_rounding_buff == NULL) {
        return ESP_ERR_NO_MEM;
    }

    long long rounding = (long long)(fir->rounding_val);

    if (fir->shift >= 0) {
        rounding = (rounding >> fir->shift);
    } else {
        rounding = (rounding << (-fir->shift));
    }

    aexx_rounding_buff[0] = (int32_t)(rounding);                // 32 lower bits (acclo) type reassignment to 32-bit
    aexx_rounding_buff[1] = (int32_t)((rounding >> 32) & 0xFF); // 8 higher bits (acchi) shift by 32 and apply the mask
    fir->rounding_buff = aexx_rounding_buff;
    fir->free_status |= 0x0004;                                 // from here on dsps_fird_s16_aexx_free() releases the buffer

#if dsps_fird_s16_arp4_enabled
    fir->pos = start_pos;

    // The delay line is always replaced by an internal, zeroed buffer with
    // 8 extra samples in front of and behind the coeffs_len samples in use
    int16_t *new_delay_buff = (int16_t *)memalign(16, (coeffs_len + 8 * 2) * sizeof(int16_t));
    if (new_delay_buff == NULL) {
        dsps_fird_s16_aexx_free(fir);
        return ESP_ERR_NO_MEM;
    }
    memset(new_delay_buff, 0, (coeffs_len + 8 * 2) * sizeof(int16_t));
    fir->delay = &new_delay_buff[8];
    fir->free_status |= 0x0001;

#endif // dsps_fird_s16_arp4_enabled

#if dsps_fird_s16_aes3_enabled

    if (fir->delay == NULL) {                                   // New delay buffer is allocated if the current delay line is NULL
        int16_t *new_delay_buff = (int16_t *)memalign(16, coeffs_len * sizeof(int16_t));
        if (new_delay_buff == NULL) {
            dsps_fird_s16_aexx_free(fir);
            return ESP_ERR_NO_MEM;
        }
        fir->delay = new_delay_buff;
        fir->free_status |= 0x0001;
    } else if ((int)fir->delay & 0xf) {                         // Delay line array must be aligned
        dsps_fird_s16_aexx_free(fir);
        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
    }

    if ((int)fir->coeffs & 0xf) {                               // Coefficients array must be aligned
        dsps_fird_s16_aexx_free(fir);
        return ESP_ERR_DSP_ARRAY_NOT_ALIGNED;
    }

    // If the number of coefficients is not divisible by 8, a new delay line and a new coefficients array are allocated,
    // the newly allocated arrays are divisible by 8. Coefficients are copied from the original fir structure to
    // the new coeffs array and the remaining space is filled with zeroes
    // dsps_fird_s16_aexx_free() must be called to free the memory after the FIR function is finished
    if (fir->coeffs_len % 8) {                                  // Number of coefficients must be divisible by 8
        int16_t zero_coeffs = (8 - (fir->coeffs_len % 8));
        int16_t new_coeffs_len = fir->coeffs_len + zero_coeffs;
        int16_t *aes3_delay_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));
        int16_t *aes3_coeffs_buff = (int16_t *)memalign(16, new_coeffs_len * sizeof(int16_t));

        if ((aes3_delay_buff == NULL) || (aes3_coeffs_buff == NULL)) {
            free(aes3_delay_buff);                              // free(NULL) is a no-op
            free(aes3_coeffs_buff);
            dsps_fird_s16_aexx_free(fir);
            return ESP_ERR_NO_MEM;
        }

        memcpy(aes3_coeffs_buff, fir->coeffs, fir->coeffs_len * sizeof(int16_t));       // copy fir->coeffs to aes3_coeffs_buff
        memset(&aes3_coeffs_buff[fir->coeffs_len], 0, zero_coeffs * sizeof(int16_t));   // add zeroes to the end

        if (fir->free_status & 0x0001) {
            free(fir->delay);                                   // drop the unpadded delay line allocated above
        }

        fir->delay = aes3_delay_buff;
        fir->coeffs = aes3_coeffs_buff;
        fir->coeffs_len = new_coeffs_len;
        fir->free_status |= 0x0002;
    }

#endif // dsps_fird_s16_aes3_enabled
#endif // CONFIG_DSP_OPTIMIZED

    memset(fir->delay, 0, fir->coeffs_len * sizeof(int16_t));   // Initialize the delay line to zero

    return ESP_OK;
}
|
||||
|
||||
/*
 * Release the buffers recorded in fir->free_status and clear the status word.
 *
 * free_status bits as used in this file:
 *   0x0001 - the delay line was allocated internally
 *   0x0002 - both the coefficients and the delay line were allocated internally
 *   0x0004 - the rounding buffer was allocated internally
 *
 * Always returns ESP_OK.
 */
esp_err_t dsps_fird_s16_aexx_free(fir_s16_t *fir)
{
    if (fir->free_status == 0) {
        return ESP_OK;                      // nothing is owned by the structure
    }

    const int owns_coeffs   = (fir->free_status & 0x0002) != 0;
    const int owns_delay    = (fir->free_status & 0x0003) != 0;
    const int owns_rounding = (fir->free_status & 0x0004) != 0;

    if (owns_coeffs) {
        free(fir->coeffs);
    }

    if (owns_delay) {
#if dsps_fird_s16_arp4_enabled
        fir->delay = &fir->delay[-8];       // step back to the address that was actually allocated
#endif
        free(fir->delay);
    }

    if (owns_rounding) {
        free(fir->rounding_buff);
    }

    fir->free_status = 0;

    return ESP_OK;
}
|
||||
|
||||
|
||||
esp_err_t dsps_16_array_rev(int16_t *arr, int16_t len)
|
||||
{
|
||||
|
||||
int16_t temp;
|
||||
|
||||
for (int i = 0; i < (int)(len / 2); i++) {
|
||||
temp = arr[i];
|
||||
arr[i] = arr[len - 1 - i];
|
||||
arr[len - 1 - i] = temp;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsps_fir_platform.h"
|
||||
#if (dsps_fird_s16_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_fir_s16_m_ae32.S"
|
||||
|
||||
// This is FIR filter for ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_fird_s16_ae32
|
||||
.type dsps_fird_s16_ae32,@function
|
||||
// The function implements the following C code:
|
||||
//int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
|
||||
|
||||
|
||||
dsps_fird_s16_ae32:
    // Decimating int16 FIR filter for the Xtensa MAC16 unit.
    // Produces len output samples (decim input samples are consumed per output)
    // and returns len in a2; returns 0 without touching any buffer when len <= 0.
    // fir->pos and fir->d_pos are read on entry and written back on exit, so the
    // filter state is carried over to the next call like in dsps_fird_s16_ansi().
    //
    // Input params                 Variables
    //
    // fir      - a2                N            - a6
    // input    - a3                pos          - a7
    // output   - a4                rounding_lo  - a8
    // len      - a5                d_pos        - a9   (reused as full_count later)
    //                              coeffs ptr   - a10
    //                              delay ptr    - a11
    //                              decim        - a12
    //                              rounding_hi  - a13
    //                              final_shift  - a14  (shift - 15)
    //                              scratch      - a15
    //
    // Stack frame: [a1 + 0] - coefficient pointer to restart from, [a1 + 4] - len (return value)

    entry       a1, 32

    // The output loop below counts a5 down to zero, so it must not be entered with len <= 0
    bgei        a5,  1, _fird_len_valid
    movi.n      a2,  0                              // no output produced
    retw.n
_fird_len_valid:

    l16si       a7,  a2, 10                         // a7  - pos
    l16si       a6,  a2, 8                          // a6  - N
    l32i        a10, a2, 0                          // a10 - coeffs
    addx2       a10, a6, a10                        // a10 - &coeffs[N]
    addi        a10, a10, -4                        // a10 - &coeffs[N - 2]
    s32i        a10, a1, 0                          // save the pointer, it is reloaded after every output sample
    l32i        a11, a2, 4                          // a11 - delay line
    addx2       a11, a7, a11                        // a11 - &delay[pos], continue where the previous call stopped
    l16si       a12, a2, 12                         // a12 - decimation
    l16si       a9,  a2, 14                         // a9  - d_pos
    l16si       a14, a2, 16                         // a14 - shift

    // prepare rounding value
    l32i        a15, a2, 20                         // get address of rounding array to a15
    l32i        a8,  a15, 0                         // a8  = lower 32 bits of the rounding value (acclo)
    l32i        a13, a15, 4                         // a13 = higher 8 bits of the rounding value (acchi), offset 4 (32 bits)

    // prepare final_shift value
    addi        a14, a14, -15                       // shift - 15
    abs         a15, a14
    blti        a15, 32, _shift_lower_than_32_init  // check if lower than 32

    // greater than 32 could only be negative shift ((-40 to +40) - 15) -> -55 to +25
    addi        a14, a14, 32                        // if greater than 32, add 32 (SRC is not defined for SAR greater than 32)
_shift_lower_than_32_init:

    bltz        a14, _shift_negative_init           // branch if lower than zero (not including zero)
    beqz        a14, _shift_negative_init           // branch if equal to zero (add zero to the previous statement)
    ssl         a14                                 // if positive, set SAR register to left shift value (SAR = 32 - shift)

    j           _end_of_shift_init

_shift_negative_init:                               // negative shift
    abs         a14, a14                            // absolute value
    ssr         a14                                 // SAR = -shift
    // final_shift is saved to SAR register, SAR is not being changed during the execution

_end_of_shift_init:
    l16si       a14, a2, 16                         // a14 - load shift value
    addi        a14, a14, -15                       // shift - 15

    s32i        a5,  a1, 4                          // save len to a1, used as the return value

    // first delay line load (decim - d_pos) when d_pos is not 0
    beqz        a9,  _fird_loop_len
    sub         a15, a12, a9                        // a15 = decim - d_pos

    loopnez     a15, ._loop_d_pos

        blt         a7,  a6, reset_fir_pos_d_pos    // skip the wrap-around while fir->pos < fir->N
            movi.n      a7,  0                      // fir->pos = 0
            l32i        a11, a2, 4                  // reset delay line to the beginning
        reset_fir_pos_d_pos:

        l16si       a15, a3, 0                      // load 16 bits from input (a3) to a15
        addi        a7,  a7, 1                      // fir->pos++
        s16i        a15, a11, 0                     // save 16 bits from a15 to delay line (a11)
        addi        a3,  a3, 2                      // increment input pointer
        addi        a11, a11, 2                     // increment delay line pointer
    ._loop_d_pos:

    j           .fill_delay_line                    // skip the first iteration of the delay line filling routine

    // outer loop
_fird_loop_len:

    loopnez     a12, .fill_delay_line

        blt         a7,  a6, reset_fir_pos          // skip the wrap-around while fir->pos < fir->N
            movi.n      a7,  0                      // fir->pos = 0
            l32i        a11, a2, 4                  // reset delay line to the beginning
        reset_fir_pos:

        l16si       a15, a3, 0                      // load 16 bits from input (a3) to a15
        addi        a7,  a7, 1                      // fir->pos++
        s16i        a15, a11, 0                     // save 16 bits from a15 to delay line (a11)
        addi        a3,  a3, 2                      // increment input pointer
        addi        a11, a11, 2                     // increment delay line pointer
    .fill_delay_line:

    // prepare MAC unit
    wsr         a8,  acclo                          // acclo = a8
    wsr         a13, acchi                          // acchi = a13

    // first part: delay[pos .. N - 1] against the top of the coefficient array
    addi        a11, a11, -4                        // preset delay line pointer, samples (array is being incremented)
    sub         a9,  a6, a7                         // a9 = full_count = fir->N - fir->pos

    // (Count / 4) - 1
    srli        a15, a9, 2                          // a15 = count = full_count /4
    addi        a10, a10, 4                         // preset coeffs pointer, samples (array is being decremented)
    addi        a15, a15, -1                        // count - 1

    // x1, x2, count, full_count, ID
    fir_s16_ae32_full a11, a10, a15, a9, __LINE__

    // second part: delay[0 .. pos - 1] against the coefficients below coeffs[pos]
    l32i        a10, a2, 0                          // load coeffs
    l32i        a11, a2, 4                          // reset delay line to the beginning
    addx2       a10, a7, a10                        // a10 = &coeffs[pos]

    srli        a15, a7, 2                          // a15 = count = full_count (fir->pos) / 4
    addi        a11, a11, -4                        // preset delay line pointer, samples (array is being incremented)
    addi        a15, a15, -1                        // count - 1

    // x1, x2, count, full_count, ID
    fir_s16_ae32_full a11, a10, a15, a7, __LINE__

    // SAR already set from the beginning to final_shift value
    abs         a15, a14                            // absolute value of shift
    l32i        a10, a1, 0                          // reset coefficient pointer
    blti        a15, 32, _shift_lower_than_32
    rsr         a9,  acchi                          // get only higher 8 bits of the acc register
    movi.n      a15, 0xFF                           // higher 8 bits mask
    and         a9,  a9, a15                        // apply mask
    srl         a15, a9
    j           _shift_set

_shift_lower_than_32:
    rsr         a9,  acchi                          // get higher 8 bits of the acc register
    movi.n      a11, 0xFF                           // higher 8 bits mask
    rsr         a15, acclo                          // get lower 32 bits of the acc register
    and         a9,  a9, a11                        // apply mask

    bltz        a14, _shift_negative                // branch if lower than zero (if negative)
    beqz        a14, _shift_negative
    src         a15, a15, a9                        // funnel shift left
    j           _shift_set

_shift_negative:                                    // negative shift
    src         a15, a9, a15                        // funnel shift right

_shift_set:

    l32i        a11, a2, 4                          // Load initial position of the delay line
    s16i        a15, a4, 0                          // save the shifted value to the output array (a4)
    addi        a5,  a5, -1                         // len--
    addi        a4,  a4, 2                          // increase pointer of the output array
    addx2       a11, a7, a11                        // p_delay[fir->pos] - (two times the fir->pos)

    // counter
    bnez        a5,  _fird_loop_len                 // break if a5 == 0

    // write the filter state back so that the next call continues from here
    s16i        a7,  a2, 10                         // fir->pos = pos
    movi.n      a15, 0
    s16i        a15, a2, 14                         // fir->d_pos = 0, the start phase is consumed only once

    l32i.n      a2,  a1, 4                          // load return value to a2
    retw.n
|
||||
|
||||
#endif // dsps_fird_s16_ae32_enabled
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsps_fir.h"
|
||||
|
||||
/*
 * Reference (plain C) implementation of the int16 decimating FIR filter.
 *
 * fir    - filter state; pos, d_pos and the delay line are updated in place
 * input  - input samples, (decim - d_pos) are consumed for the first output
 *          sample and decim for every following one
 * output - receives len filtered samples
 * len    - number of OUTPUT samples (length of the input array / decimation)
 *
 * Returns the number of output samples written.
 */
int32_t dsps_fird_s16_ansi(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len)
{
    const int32_t final_shift = fir->shift - 15;
    int32_t produced = 0;
    int32_t consumed = 0;

    // Rounding term, pre-shifted by the opposite of the output shift and limited to 40 bits
    long long rounding = (long long)(fir->rounding_val);
    if (fir->shift < 0) {
        rounding <<= (-fir->shift);
    } else {
        rounding >>= fir->shift;
    }
    rounding &= 0xFFFFFFFFFF;

    while (produced < len) {

        // Push the next input samples into the circular delay line
        for (int j = 0; j < fir->decim - fir->d_pos; j++) {
            if (fir->pos >= fir->coeffs_len) {
                fir->pos = 0;
            }
            fir->delay[fir->pos] = input[consumed];
            fir->pos++;
            consumed++;
        }
        fir->d_pos = 0;     // the start phase only shortens the very first fill

        long long acc = rounding;
        int16_t tap = fir->coeffs_len - 1;
        int n;

        // Samples from the write position to the end of the delay line ...
        n = fir->pos;
        while (n < fir->coeffs_len) {
            acc += (int32_t)fir->coeffs[tap] * (int32_t)fir->delay[n];
            tap--;
            n++;
        }

        // ... followed by the samples from the start of the delay line up to the write position
        n = 0;
        while (n < fir->pos) {
            acc += (int32_t)fir->coeffs[tap] * (int32_t)fir->delay[n];
            tap--;
            n++;
        }

        int16_t sample;
        if (final_shift > 0) {
            sample = (int16_t)(acc << final_shift);
        } else {
            sample = (int16_t)(acc >> (-final_shift));
        }
        output[produced] = sample;
        produced++;
    }

    return produced;
}
|
||||
@@ -0,0 +1,150 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_fir_platform.h"
|
||||
#if (dsps_fird_s16_arp4_enabled == 1)
|
||||
|
||||
// This is FIR filter for esp32p4 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_fird_s16_arp4
|
||||
.global dsps_fird_s16_ansi
|
||||
.type dsps_fird_s16_arp4,@function
|
||||
// The function implements the following C code:
|
||||
// int32_t dsps_fird_s16_arp4(fir_s16_t *fir, const int16_t *input, int16_t *output, int32_t len);
|
||||
|
||||
dsps_fird_s16_arp4:
    // Decimating int16 FIR filter using the esp32p4 vector instructions.
    //
    // a0 - fir, a1 - input, a2 - output, a3 - len (number of output samples)
    // Returns the number of output samples written in a0 (0 when len <= 0).
    // Filters whose length is not a multiple of 8 are handed over to
    // dsps_fird_s16_ansi with the arguments untouched.

    // In case of filter length different than 8*K
    lh      t2, 8(a0)               // t2 - coeffs_len
    andi    t2, t2, 7
    beqz    t2, .dsps_fird_s16_arp4_body
    j       dsps_fird_s16_ansi

.dsps_fird_s16_arp4_body:
    // The loops below always run at least once, so reject len <= 0 up front
    bgtz    a3, .dsps_fird_s16_arp4_run
    li      a0, 0                   // no output produced
    ret

.dsps_fird_s16_arp4_run:
    add     sp,sp,-48
    sw      s0,  0(sp)
    sw      s1,  4(sp)
    sw      s2,  8(sp)
    sw      s3,  12(sp)
    sw      s4,  16(sp)
    sw      s5,  20(sp)
    sw      s6,  24(sp)
    sw      s7,  28(sp)
    sw      s8,  32(sp)
    sw      s9,  36(sp)
    sw      s10, 40(sp)
    sw      s11, 44(sp)

    mv      a6, a3                  // a6 - requested output count, returned at .fast_exit (a3 is counted down)

    // Enable unaligned data access
    esp.movx.r.cfg t6
    or      t6, t6, 2
    esp.movx.w.cfg t6

    lw      t1, 4(a0)               // t1 - delay_line
    lh      t2, 8(a0)               // t2 - coeffs_len
    lh      t3, 10(a0)              // t3 - pos
    lh      t6, 16(a0)              // t6 - shift
    add     t6, t6, -15
    neg     t6, t6                  // t6 = 15 - shift
    lw      t5, 20(a0)              // t5 - rounding_buff
    lw      s2, 4(a0)               // s2 - delay_line* current position
    add     s2, s2, t3              // s2 = delay_line + pos*2
    add     s2, s2, t3              //
    add     s4, t2, t2              // s4 = coeff_len*2
    add     s0, t1, s4              // s0 - &delay[coeffs_len]

    lh      a4, 0(t1)               // NOTE(review): a4 is not read afterwards in this function
.loop_len:
    lh      t4, 12(a0)              // t4 - decim
.loop_decim_copy:
    lh      s1, 0(a1)               // load input data
    add     a1, a1, 2

    sh      s1, 0(s2)
    add     s2, s2, 2               // advance the delay line write pointer
    bgt     s0, s2, .skeep_reset    // keep the pointer while it is below &delay[coeffs_len]
    lw      s2, 4(a0)               // s2 - delay_line (wrap to the beginning)
.skeep_reset:
    add     t4, t4, -1
    bgtz    t4, .loop_decim_copy

    // s5 - count1 = length - pos
    // s6 = count1 >> 3 :
    sub     t3, s2, t1
    srli    t3, t3, 1               // t3 = (pos*2)>>1
    sub     s5, t2, t3
    srli    s6, s5, 3               // s6 = (coeff_len - pos)>>3

    srli    s7, t3, 3               // s7 = pos>>3
    and     s8, t3, 0x07            // s8 = pos&0x07

    esp.ld.xacc.ip t5, 0            // load rounding value to accx

    lw      s10, 0(a0)              // s10 - coeffs
    esp.vld.128.ip q0, s10, 16      // q0 - coeffs
    mv      s9, s2                  // s9 - pointer to delay line
    esp.vld.128.ip q1, s9, 16       // q1 - delay line data

    beqz    s6, .skip_main_loop1
    esp.lp.setup 0, s6, .main_loop1
        esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
    .main_loop1: esp.vld.128.ip q1, s9, 16              // Load delay line
    .skip_main_loop1: nop

    add     s9, s9, -16
    sub     s9, s9, s4              // step back by the delay line length in bytes
    beqz    s8, .skip_rest_add
    esp.vld.128.ip q2, s9, 16
    esp.vadd.s16 q1, q2, q1
    esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1       // q0 - coeffs, q1 - data
.skip_rest_add:
    esp.vld.128.ip q1, s9, 16

    beqz    s7, .skip_main_loop3
    esp.lp.setup 1, s7, .main_loop3
        esp.vmulas.s16.xacc.ld.ip q0, s10, 16, q0, q1   // q0 - coeffs, q1 - data
        esp.vld.128.ip q1, s9, 16
    .main_loop3: nop
    .skip_main_loop3: nop

    // Shift and Store result
    esp.srs.s.xacc s11, t6          // shift accx by t6 (15 - shift), result to s11
    sh      s11, 0(a2)              // store result to output buffer
    add     a2, a2, 2

    add     a3, a3, -1
    bgtz    a3, .loop_len
    sh      t3, 10(a0)              // fir->pos = pos, kept for the next call

.fast_exit:
    mv      a0, a6                  // return the number of output samples

    lw      s0,  0(sp)
    lw      s1,  4(sp)
    lw      s2,  8(sp)
    lw      s3,  12(sp)
    lw      s4,  16(sp)
    lw      s5,  20(sp)
    lw      s6,  24(sp)
    lw      s7,  28(sp)
    lw      s8,  32(sp)
    lw      s9,  36(sp)
    lw      s10, 40(sp)
    lw      s11, 44(sp)

    add     sp,sp,48
    ret
|
||||
|
||||
#endif //
|
||||
Reference in New Issue
Block a user