add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,120 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_cplx_gen_platform.h"
#if (dsps_cplx_gen_aes3_enbled || dsps_cplx_gen_ae32_enbled)
// This is a Complex signal generator for ESP32 processor.
.text
.align 4
.global dsps_cplx_gen_ae32
.type dsps_cplx_gen_ae32,@function
// The function implements the following C code:
// esp_err_t dsps_cplx_gen_ae32(cplx_sig_t *cplx_gen, void *output, int32_t len);
dsps_cplx_gen_ae32:
// Generates `len` complex samples by table lookup. For every sample the value read
// a quarter of the table ahead of the sin position (the cos value) is stored first,
// followed by the value at the sin position. The running phase is kept in f3 only;
// nothing is stored back into the cplx_gen structure.
//
// Input params      Variables float       Variables fixed
//
// cplx_gen - a2     fr        - f0        lut        - a5
// output   - a3     one_const - f1        lut_len    - a6
// len      - a4     lut_len_f - f2        sin_pos    - a7
//                   ph_f      - f3        cos_pos    - a8
//                   sin_pos_f - f4        sin_to_cos - a9
//                                         ph_floor   - a10
//                                         modulo     - a11
entry a1, 32
l32i a5, a2, 0 // a5 - lut (structure offset 0)
l32i a6, a2, 4 // a6 - lut_len (structure offset 4)
lsi f0, a2, 8 // f0 - fr, the phase increment added after every sample (structure offset 8)
lsi f3, a2, 12 // f3 - ph_f, the starting phase (structure offset 12)
const.s f1, 1 // f1 - constant 1
float.s f2, a6, 0 // f2 - lut_len_f (lut_len converted to float)
srli a9, a6, 2 // a9 - sin_to_cos = lut_len / 4
addi a11, a6, -1 // a11 - modulo = lut_len - 1 (bit mask, lut_len is a power of 2)
l32i a15, a2, 16 // a15 - d_type (structure offset 16)
beqz a15, _s16_fixed // d_type == 0 selects the Q15 fixed point path
// F32 floating point
loopnez a4, ._main_loop_float // zero-overhead loop over len samples, skipped when len == 0
floor.s a10, f3, 0 // ph_floor = ph_f truncated with rounding towards -infinity
// skip the +1 correction if ph_floor is greater than or equal to 0
bgez a10, _ph_check_low_float
add.s f3, f3, f1 // f3 = f3 + f1 (ph_f + 1)
floor.s a10, f3, 0 // recompute ph_floor for the corrected phase
_ph_check_low_float:
// skip the -1 correction if ph_floor is lower than 1 (i.e. ph_f < 1)
blti a10, 1, _ph_check_great_float
sub.s f3, f3, f1 // f3 = f3 - f1 (ph_f - 1)
_ph_check_great_float:
mul.s f4, f3, f2 // sin_pos_f = ph_f * lut_len
trunc.s a7, f4, 0 // truncate sin_pos_f to sin_pos (sin_pos itself is not masked)
add a8, a7, a9 // cos_pos (a8) = sin_pos(a7) + sin_to_cos(a9)
and a8, a8, a11 // cos_pos = cos_pos & modulo (lut_len - 1)
slli a8, a8, 2 // byte offset into the LUT (4 x cos_pos)
slli a7, a7, 2 // byte offset into the LUT (4 x sin_pos)
lsx f14, a5, a7 // load sin LUT value from *lut
lsx f15, a5, a8 // load cos LUT value from *lut
ssi f15, a3, 0 // save cos LUT value to the output, offset 0
ssi f14, a3, 4 // save sin LUT value to the output, offset 4
add.s f3, f3, f0 // ph_f += fr
addi.n a3, a3, 8 // increase the output pointer (2 x f32)
._main_loop_float:
movi.n a2, 0 // return value 0 in a2
retw.n
// Q15 fixed point
_s16_fixed:
loopnez a4, ._main_loop_fixed // zero-overhead loop over len samples, skipped when len == 0
floor.s a10, f3, 0 // ph_floor = ph_f truncated with rounding towards -infinity
// skip the +1 correction if ph_floor is greater than or equal to 0
bgez a10, _ph_check_low_fixed
add.s f3, f3, f1 // f3 = f3 + f1 (ph_f + 1)
floor.s a10, f3, 0 // recompute ph_floor for the corrected phase
_ph_check_low_fixed:
// skip the -1 correction if ph_floor is lower than 1 (i.e. ph_f < 1)
blti a10, 1, _ph_check_great_fixed
sub.s f3, f3, f1 // f3 = f3 - f1 (ph_f - 1)
_ph_check_great_fixed:
mul.s f4, f3, f2 // sin_pos_f = ph_f * lut_len
trunc.s a7, f4, 0 // truncate sin_pos_f to sin_pos (sin_pos itself is not masked)
add a8, a7, a9 // cos_pos (a8) = sin_pos(a7) + sin_to_cos(a9)
and a8, a8, a11 // cos_pos = cos_pos & modulo (lut_len - 1)
addx2 a15, a8, a5 // address of the cos LUT entry (*lut + 2 x cos_pos)
addx2 a13, a7, a5 // address of the sin LUT entry (*lut + 2 x sin_pos)
l16si a14, a15, 0 // load cos LUT value from *lut
l16si a12, a13, 0 // load sin LUT value from *lut
s16i a14, a3, 0 // save cos LUT value to the output (a3), offset 0
s16i a12, a3, 2 // save sin LUT value to the output (a3), offset 2
add.s f3, f3, f0 // ph_f += fr
addi.n a3, a3, 4 // increase the output pointer (2 x s16)
._main_loop_fixed:
movi.n a2, 0 // return value 0 in a2
retw.n
#endif // (dsps_cplx_gen_aes3_enbled || dsps_cplx_gen_ae32_enbled)

View File

@@ -0,0 +1,40 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_cplx_gen.h"
/*
 * Portable C implementation of the complex LUT generator.
 *
 * For each of the `len` samples the wrapped phase is converted into an index
 * of the lookup table; the entry a quarter of the table further on is written
 * to the even output slot and the entry at the index itself to the odd slot.
 * The phase is tracked in a local variable only, the generator structure is
 * never modified. Always returns ESP_OK.
 */
esp_err_t dsps_cplx_gen_ansi(cplx_sig_t *cplx_gen, void *output, int32_t len)
{
    // the structure already holds the per-sample phase step
    const float step = cplx_gen->freq;
    // distance between the sin entry and the matching cos entry
    const int quarter_table = cplx_gen->lut_len / 4;
    float running_phase = cplx_gen->phase;

    for (int sample = 0; sample < len; sample++) {
        // keep the phase inside [0, 1) - one correction in each direction is enough
        if (running_phase < 0) {
            running_phase += 1.0;
        }
        if (running_phase >= 1.0) {
            running_phase -= 1.0;
        }

        const int sin_idx = (int)(running_phase * (cplx_gen->lut_len));
        const int cos_idx = (sin_idx + quarter_table) & (cplx_gen->lut_len - 1);
        const int slot_cos = sample * 2;
        const int slot_sin = slot_cos + 1;

        if (cplx_gen->d_type != S16_FIXED) {
            float *dst = (float *)output;
            const float *table = (const float *)cplx_gen->lut;
            dst[slot_cos] = table[cos_idx];
            dst[slot_sin] = table[sin_idx];
        } else {
            int16_t *dst = (int16_t *)output;
            const int16_t *table = (const int16_t *)cplx_gen->lut;
            dst[slot_cos] = table[cos_idx];
            dst[slot_sin] = table[sin_idx];
        }

        running_phase += step;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,148 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_cplx_gen.h"
#include "dsp_common.h"
#include "esp_log.h"
#include <math.h>
#include <malloc.h>
#define Q15_MAX INT16_MAX
static const char *TAG = "dsps_cplx_gen";
/*
 * Initialize a complex generator structure and, when no LUT is supplied,
 * allocate and fill a one-period sine LUT of the requested data type.
 *
 * cplx_gen      - structure to initialize
 * d_type        - S16_FIXED (Q15) or F32_FLOAT
 * lut           - user supplied LUT, or NULL to have one allocated here
 * lut_len       - number of LUT entries, must be a power of 2; additionally
 *                 limited to 256..8192 when the LUT is allocated here
 * freq          - must satisfy -1 < freq < 1
 * initial_phase - must satisfy -1 < initial_phase < 1
 *
 * Returns ESP_OK on success, otherwise one of the ESP_ERR_DSP_* codes.
 * Bit 0x0001 of free_status is set only once a LUT has really been allocated,
 * so cplx_gen_free() can be called safely after any return value.
 */
esp_err_t dsps_cplx_gen_init(cplx_sig_t *cplx_gen, out_d_type d_type, void *lut, int32_t lut_len, float freq, float initial_phase)
{
    // The structure is filled in before validation on purpose: callers release
    // it with cplx_gen_free() even after a failed init, which needs a defined free_status.
    cplx_gen->lut_len = lut_len;
    cplx_gen->freq = freq;
    cplx_gen->lut = lut;
    cplx_gen->free_status = 0;
    cplx_gen->d_type = d_type;
    cplx_gen->phase = initial_phase;

    // length of the LUT must be power of 2
    if (!dsp_is_power_of_two(lut_len)) {
        ESP_LOGE(TAG, "The length of the LUT must be power of 2");
        return ESP_ERR_DSP_INVALID_LENGTH;
    }

    // LUT length must be in a range from 256 to 8192 (checked only for an internally allocated LUT)
    if ((lut == NULL) && ((cplx_gen->lut_len > 8192) || (cplx_gen->lut_len < 256))) {
        ESP_LOGE(TAG, "The length of the LUT table out of range. Valid range is 256 to 8192");
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    // frequency is a Nyquist frequency, must be in a range from (-1 to 1)
    if ((cplx_gen->freq >= 1) || (cplx_gen->freq <= -1)) {
        ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    // initial phase in a range from (-1 to 1)
    if ((cplx_gen->phase >= 1) || (cplx_gen->phase <= -1)) {
        ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    // A user supplied LUT is used as it is, nothing to generate
    if (lut != NULL) {
        return ESP_OK;
    }

    // LUT table coefficients generation
    if (cplx_gen->d_type == S16_FIXED) { // Q15 fixed point
        int16_t *local_lut = (int16_t *)malloc(cplx_gen->lut_len * sizeof(int16_t));
        if (local_lut == NULL) {
            // nothing was allocated: lut stays NULL and free_status stays 0.
            // Reported through an error code this function already returns.
            ESP_LOGE(TAG, "Unable to allocate memory for the LUT table");
            return ESP_ERR_DSP_PARAM_OUTOFRANGE;
        }
        float term;
        for (int i = 0 ; i < cplx_gen->lut_len; i++) {
            term = (2.0 * M_PI) * ((float)(i) / (float)(cplx_gen->lut_len));
            local_lut[i] = (int16_t)(sin(term) * Q15_MAX); // conversion to Q15 fixed point
        }
        cplx_gen->lut = (void *)local_lut;
    } else if (cplx_gen->d_type == F32_FLOAT) { // Single precision floating point
        float *local_lut = (float *)malloc(cplx_gen->lut_len * sizeof(float));
        if (local_lut == NULL) {
            ESP_LOGE(TAG, "Unable to allocate memory for the LUT table");
            return ESP_ERR_DSP_PARAM_OUTOFRANGE;
        }
        float term;
        for (int i = 0 ; i < cplx_gen->lut_len; i++) {
            term = (2.0 * M_PI) * ((float)(i) / (float)(cplx_gen->lut_len));
            local_lut[i] = (float)sin(term);
        }
        cplx_gen->lut = (void *)local_lut;
    } else {
        // unknown data type, no LUT can be built
        cplx_gen->lut = NULL;
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    // the LUT now exists and is owned by the structure, cplx_gen_free() has to release it
    cplx_gen->free_status |= 0x0001;
    return ESP_OK;
}
/*
 * Store a new frequency in the generator structure.
 * Values with freq >= 1 or freq <= -1 are rejected with
 * ESP_ERR_DSP_INVALID_PARAM and leave the structure untouched.
 */
esp_err_t dsps_cplx_gen_freq_set(cplx_sig_t *cplx_gen, float freq)
{
    // frequency is a Nyquist frequency, must be in a range from (-1 to 1)
    const int freq_rejected = (freq <= -1) || (freq >= 1);

    if (!freq_rejected) {
        cplx_gen->freq = freq;
        return ESP_OK;
    }

    ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
    return ESP_ERR_DSP_INVALID_PARAM;
}
/*
 * Read back the frequency stored in the generator structure.
 * A lut_len that is not a power of two is taken as the sign of an
 * uninitialized structure; in that case -2 is returned.
 */
float dsps_cplx_gen_freq_get(cplx_sig_t *cplx_gen)
{
    if (dsp_is_power_of_two(cplx_gen->lut_len)) {
        return cplx_gen->freq;
    }

    // the structure does not look initialized
    ESP_LOGE(TAG, "cplx_gen strucure was not initialized");
    return -2;
}
/*
 * Store a new phase in the generator structure.
 * Values with phase >= 1 or phase <= -1 are rejected with
 * ESP_ERR_DSP_INVALID_PARAM and leave the structure untouched.
 */
esp_err_t dsps_cplx_gen_phase_set(cplx_sig_t *cplx_gen, float phase)
{
    // phase must be in a range from (-1 to 1)
    const int phase_rejected = (phase <= -1) || (phase >= 1);

    if (!phase_rejected) {
        cplx_gen->phase = phase;
        return ESP_OK;
    }

    ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
    return ESP_ERR_DSP_INVALID_PARAM;
}
/*
 * Read back the phase stored in the generator structure.
 * A lut_len that is not a power of two is taken as the sign of an
 * uninitialized structure; in that case -2 is returned.
 */
float dsps_cplx_gen_phase_get(cplx_sig_t *cplx_gen)
{
    if (dsp_is_power_of_two(cplx_gen->lut_len)) {
        return cplx_gen->phase;
    }

    // the structure does not look initialized
    ESP_LOGE(TAG, "cplx_gen strucure was not initialized");
    return -2;
}
/*
 * Store a new frequency and a new phase in one call.
 * The frequency is validated first, then the phase; both have to lie strictly
 * between -1 and 1. Nothing is written unless both values are accepted.
 */
esp_err_t dsps_cplx_gen_set(cplx_sig_t *cplx_gen, float freq, float phase)
{
    // frequency is a Nyquist frequency, must be in a range from (-1 to 1)
    const int freq_rejected = (freq <= -1) || (freq >= 1);
    if (freq_rejected) {
        ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    // phase in a range from (-1 to 1)
    const int phase_rejected = (phase <= -1) || (phase >= 1);
    if (phase_rejected) {
        ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    cplx_gen->freq = freq;
    cplx_gen->phase = phase;
    return ESP_OK;
}
/*
 * Release the LUT if, and only if, it was allocated by dsps_cplx_gen_init()
 * (bit 0x0001 of free_status). A user supplied LUT is never touched.
 * After the release the pointer is cleared, so the structure does not keep a
 * dangling reference to freed memory; calling the function again is harmless.
 */
void cplx_gen_free(cplx_sig_t *cplx_gen)
{
    if (cplx_gen->free_status & 0x0001) {
        free(cplx_gen->lut);
        cplx_gen->lut = NULL;       // do not leave a dangling pointer behind
        cplx_gen->free_status = 0;
    }
}

View File

@@ -0,0 +1,260 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <malloc.h>
#include <stdint.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "esp_dsp.h"
#include <math.h>
#include "dsp_tests.h"
#include "dsps_cplx_gen.h"
#include "dsps_wind.h"
#include "dsps_view.h"
#include "dsps_fft2r.h"
#define LEAKAGE_BINS 10 // fft leakage bins
static const char *TAG = "dsps_cplx_gen";
// Error message handler function, which detects errors returned by dsps_cplx_gen_init() function
// Helper for the tests below: when dsps_cplx_gen_init() reported an error, the
// generator is released and the running test is failed with a message that
// matches the returned status. ESP_OK is a no-op.
void error_msg_handler(cplx_sig_t *cplx_signal, esp_err_t status)
{
    if (status == ESP_OK) {
        return;
    }

    cplx_gen_free(cplx_signal);

    // pick the explanation for the failure, falling back to a generic one
    const char *failure_reason = "Unspecified error";
    if (status == ESP_ERR_DSP_INVALID_LENGTH) {
        failure_reason = "LUT table has invalid length, must be power of 2";
    } else if (status == ESP_ERR_DSP_PARAM_OUTOFRANGE) {
        failure_reason = "LUT table length must be in a range from 256 to 8192";
    } else if (status == ESP_ERR_DSP_INVALID_PARAM) {
        failure_reason = "Frequency and initial phase must be in a range from -1 to 1";
    }

    TEST_ASSERT_MESSAGE(false, failure_reason);
}
// Compares the platform implementation selected by dsps_cplx_gen with the ANSI C
// reference, sample by sample, for both output data types. Both generators share
// one LUT: the second structure is initialized with the LUT allocated by the first
// one, so only the first structure owns (and frees) it.
TEST_CASE("cplx_gen_functionality_test", "[dsps]")
{
    const int32_t out_len = 4096;
    const int32_t lut_len = 1024;
    const float frequency = 0.001;
    const float init_phase = 0.1;

    cplx_sig_t cplx_signal, cplx_signal_compare;

    // F32 float
    esp_err_t status1 = dsps_cplx_gen_init(&cplx_signal, F32_FLOAT, NULL, lut_len, frequency, init_phase);
    error_msg_handler(&cplx_signal, status1);
    esp_err_t status2 = dsps_cplx_gen_init(&cplx_signal_compare, F32_FLOAT, cplx_signal.lut, lut_len, frequency, init_phase);
    error_msg_handler(&cplx_signal_compare, status2);

    float *out_array_float = (float *)malloc(out_len * 2 * sizeof(float)); // times 2 for real and imaginary part
    float *out_array_compare_float = (float *)malloc(out_len * 2 * sizeof(float));
    TEST_ASSERT_NOT_NULL(out_array_float);
    TEST_ASSERT_NOT_NULL(out_array_compare_float);

    dsps_cplx_gen_ansi(&cplx_signal_compare, (void *)out_array_compare_float, out_len);
    dsps_cplx_gen(&cplx_signal, (void *)out_array_float, out_len);

    for (int i = 0; i < out_len * 2; i++) {
        // TEST_ASSERT_EQUAL is an integer assertion and would truncate the samples,
        // which all lie in -1 ... 1; the float assertion compares the real values
        TEST_ASSERT_EQUAL_FLOAT(out_array_compare_float[i], out_array_float[i]);
    }

    free(out_array_float);
    free(out_array_compare_float);
    cplx_gen_free(&cplx_signal);
    cplx_gen_free(&cplx_signal_compare);

    // S16 fixed
    status1 = dsps_cplx_gen_init(&cplx_signal, S16_FIXED, NULL, lut_len, frequency, init_phase);
    error_msg_handler(&cplx_signal, status1);
    status2 = dsps_cplx_gen_init(&cplx_signal_compare, S16_FIXED, cplx_signal.lut, lut_len, frequency, init_phase);
    error_msg_handler(&cplx_signal_compare, status2);

    int16_t *out_array_fixed = (int16_t *)malloc(out_len * 2 * sizeof(int16_t)); // times 2 for real and imaginary part
    int16_t *out_array_compare_fixed = (int16_t *)malloc(out_len * 2 * sizeof(int16_t));
    TEST_ASSERT_NOT_NULL(out_array_fixed);
    TEST_ASSERT_NOT_NULL(out_array_compare_fixed);

    dsps_cplx_gen_ansi(&cplx_signal_compare, (void *)out_array_compare_fixed, out_len);
    dsps_cplx_gen(&cplx_signal, (void *)out_array_fixed, out_len);

    for (int i = 0; i < out_len * 2; i++) {
        TEST_ASSERT_EQUAL_INT16(out_array_compare_fixed[i], out_array_fixed[i]);
    }

    free(out_array_fixed);
    free(out_array_compare_fixed);
    cplx_gen_free(&cplx_signal);
    cplx_gen_free(&cplx_signal_compare);
}
// Benchmark of dsps_cplx_gen for both output data types. The output length starts
// at 32 complex samples and is doubled after each of the 6 rounds (32 ... 1024).
// Every round calls the generator repeat_count times per data type and logs the
// averaged cycle counts.
TEST_CASE("cplx_gen_benchmark_test", "[dsps]")
{
int32_t out_len = 32;
const int32_t lut_len = 256;
const float frequency = 0.02;
const float init_phase = 0.9;
const int repeat_count = 4;
cplx_sig_t cplx_signal_float, cplx_signal_fixed;
esp_err_t status1 = dsps_cplx_gen_init(&cplx_signal_float, F32_FLOAT, NULL, lut_len, frequency, init_phase);
error_msg_handler(&cplx_signal_float, status1);
esp_err_t status2 = dsps_cplx_gen_init(&cplx_signal_fixed, S16_FIXED, NULL, lut_len, frequency, init_phase);
error_msg_handler(&cplx_signal_fixed, status2);
// Buffers sized for the largest round: 32 (initial out_len) * 32 = 1024 complex samples, * 2 (real and imaginary)
float *out_array_float = (float *)malloc(out_len * 2 * 32 * sizeof(float));
int16_t *out_array_fixed = (int16_t *)malloc(out_len * 2 * 32 * sizeof(int16_t));
for (int i = 0; i < 6; i++) {
// measure repeat_count calls of the float generator
const unsigned int start_float = dsp_get_cpu_cycle_count();
for (int j = 0 ; j < repeat_count ; j++) {
dsps_cplx_gen(&cplx_signal_float, (void *)out_array_float, out_len);
}
const unsigned int end_float = dsp_get_cpu_cycle_count();
// measure repeat_count calls of the fixed point generator
const unsigned int start_fixed = dsp_get_cpu_cycle_count();
for (int j = 0 ; j < repeat_count ; j++) {
dsps_cplx_gen(&cplx_signal_fixed, (void *)out_array_fixed, out_len);
}
const unsigned int end_fixed = dsp_get_cpu_cycle_count();
const float total_float = end_float - start_float;
const float total_fixed = end_fixed - start_fixed;
// average cycles of one generator call (this is the value logged as "total cycles")
const float cycles_float = total_float / (float)(repeat_count);
const float cycles_fixed = total_fixed / (float)(repeat_count);
// average cycles per generated complex output sample
const float cycles_per_lut_sample_float = total_float / (float)(out_len * repeat_count);
const float cycles_per_lut_sample_fixed = total_fixed / (float)(out_len * repeat_count);
ESP_LOGI(TAG, "Float : %.2f total cycles, %.2f cycles per sample, for %"PRId32" LUT samples, %"PRId32" output array length",
cycles_float, cycles_per_lut_sample_float, lut_len, out_len);
ESP_LOGI(TAG, "Fixed : %.2f total cycles, %.2f cycles per sample, for %"PRId32" LUT samples, %"PRId32" output array length \n",
cycles_fixed, cycles_per_lut_sample_fixed, lut_len, out_len);
// next round generates twice as many samples
out_len *= 2;
}
free(out_array_fixed);
free(out_array_float);
cplx_gen_free(&cplx_signal_float);
cplx_gen_free(&cplx_signal_fixed);
}
// Generates a windowed complex tone with different amplitudes for the real and the
// imaginary part, runs it through the FFT and reports signal power, noise power,
// SNR and SFDR for both halves of the resulting spectrum.
TEST_CASE("cplx_gen_noise_SNR_test", "[dsps]")
{
    const int32_t out_len = 2048;
    const int32_t lut_len = 8192;
    const int32_t n_fft = out_len * 2;      // * 2 (real and imaginary)
    const float frequency = 0.01;
    const float init_phase = 0.0;
    const float real_ampl = 0.5;
    const float imag_ampl = 0.2;

    cplx_sig_t cplx_signal_float;
    esp_err_t status = dsps_cplx_gen_init(&cplx_signal_float, F32_FLOAT, NULL, lut_len, frequency, init_phase);
    error_msg_handler(&cplx_signal_float, status);

    float *out_array_float = (float *)memalign(16, n_fft * sizeof(float));
    dsps_cplx_gen(&cplx_signal_float, (void *)out_array_float, out_len);

    // Signal windowing
    float *window = (float *)memalign(16, out_len * sizeof(float));
    dsps_wind_blackman_harris_f32(window, out_len);

    for (int i = 0 ; i < out_len ; i++) {
        out_array_float[i * 2 + 0] *= (window[i] * real_ampl);
        out_array_float[i * 2 + 1] *= (window[i] * imag_ampl);
    }
    free(window);

    // Initialize FFT
    esp_err_t ret = dsps_fft2r_init_fc32(NULL, n_fft);
    if (ret != ESP_OK) {
        ESP_LOGE(TAG, "Not possible to initialize FFT. Error = %i", ret);
        // release everything that is still allocated before leaving the test
        free(out_array_float);
        cplx_gen_free(&cplx_signal_float);
        return;
    }

    // Do the FFT
    dsps_fft2r_fc32(out_array_float, out_len);
    dsps_bit_rev_fc32(out_array_float, out_len);
    dsps_cplx2reC_fc32(out_array_float, out_len);

    // Convert the FFT spectrum from amplitude to watts, find the max value and its position
    // (the first n_fft / 4 bins and the remaining n_fft / 4 bins are evaluated separately)
    float max_val_1 = -1000000, max_val_2 = -1000000;
    int max_pos_1 = 0, max_pos_2 = 0, spur_pos_1 = 0, spur_pos_2 = 0;
    for (int i = 0 ; i < n_fft / 2 ; i++) {
        out_array_float[i] = (out_array_float[i * 2 + 0] * out_array_float[i * 2 + 0] + out_array_float[i * 2 + 1] * out_array_float[i * 2 + 1]) / (n_fft * 3);
        if (i < n_fft / 4) {
            if (out_array_float[i] > max_val_1) {
                max_val_1 = out_array_float[i];
                max_pos_1 = i;
            }
        } else {
            if (out_array_float[i] > max_val_2) {
                max_val_2 = out_array_float[i];
                max_pos_2 = i;
            }
        }
    }

    // Calculate the power of the signal and noise of the spectrum and convert the spectrum to dB
    // Bins within LEAKAGE_BINS of the maximum count as signal, all the others as noise;
    // the strongest noise bin is remembered as the spur
    float signal_pow_1 = 0, signal_pow_2 = 0, noise_pow_1 = 0, noise_pow_2 = 0;
    float spur_1 = -1000000, spur_2 = -1000000;
    for (int i = 0 ; i < n_fft / 2 ; i++) {
        if (i < n_fft / 4) {
            if ((i >= max_pos_1 - LEAKAGE_BINS) && (i <= max_pos_1 + LEAKAGE_BINS)) {
                signal_pow_1 += out_array_float[i];
            } else {
                noise_pow_1 += out_array_float[i];
                if (out_array_float[i] > spur_1) {
                    spur_1 = out_array_float[i];
                    spur_pos_1 = i;
                }
            }
        } else {
            if ((i >= max_pos_2 - LEAKAGE_BINS) && (i <= max_pos_2 + LEAKAGE_BINS)) {
                signal_pow_2 += out_array_float[i];
            } else {
                noise_pow_2 += out_array_float[i];
                if (out_array_float[i] > spur_2) {
                    spur_2 = out_array_float[i];
                    spur_pos_2 = i;
                }
            }
        }
        // the small offset keeps log10f() away from zero
        out_array_float[i] = 10 * log10f(0.0000000000001 + out_array_float[i]);
    }

    // Convert the signal power and noise power from watts to dB and calculate SNR and SFDR
    const float snr_1 = 10 * log10f(signal_pow_1 / noise_pow_1);
    const float snr_2 = 10 * log10f(signal_pow_2 / noise_pow_2);
    noise_pow_1 = 10 * log10f(noise_pow_1);
    noise_pow_2 = 10 * log10f(noise_pow_2);
    signal_pow_1 = 10 * log10f(signal_pow_1);
    signal_pow_2 = 10 * log10f(signal_pow_2);
    const float sfdr_1 = out_array_float[max_pos_1] - out_array_float[spur_pos_1];
    const float sfdr_2 = out_array_float[max_pos_2] - out_array_float[spur_pos_2];

    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f \nSFDR: \t\t%f", signal_pow_1, noise_pow_1, snr_1, sfdr_1);
    dsps_view(out_array_float, n_fft / 4, 64, 16, -140, 40, '|');
    putchar('\n');
    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f \nSFDR: \t\t%f", signal_pow_2, noise_pow_2, snr_2, sfdr_2);
    dsps_view(out_array_float + (n_fft / 4), n_fft / 4, 64, 16, -140, 40, '|');

    free(out_array_float);
    cplx_gen_free(&cplx_signal_float);
}

View File

@@ -0,0 +1,187 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef _dsps_cplx_gen_H_
#define _dsps_cplx_gen_H_
#include "dsp_err.h"
#include "dsps_cplx_gen_platform.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Enum defining the output data type of the complex generator
*
* The same type also applies to the elements of the lookup table.
*/
typedef enum output_data_type {
S16_FIXED = 0, /*!< Q15 fixed point - int16_t*/
F32_FLOAT = 1, /*!< Single precision floating point - float*/
} out_d_type;
/**
* @brief Data struct of the complex signal generator
*
* This structure is used by a complex generator internally. A user should access this structure only in case of
* extensions for the DSP Library.
* All the fields of this structure are initialized by the dsps_cplx_gen_init(...) function.
*
* @note dsps_cplx_gen_ae32() loads lut, lut_len, freq, phase and d_type from fixed byte offsets
*       (0, 4, 8, 12 and 16), so the order and the size of these fields must not be changed.
*/
typedef struct cplx_sig_s {
void *lut; /*!< Pointer to the lookup table (int16_t or float elements, depending on d_type).*/
int32_t lut_len; /*!< Length of the lookup table in elements, a power of 2.*/
float freq; /*!< Frequency of the output signal, added to the phase after every output sample. Accepted range: -1 < freq < 1*/
float phase; /*!< Phase (initial_phase during init). Accepted range: -1 < phase < 1*/
out_d_type d_type; /*!< Output data type*/
int16_t free_status; /*!< Indicator for cplx_gen_free(...) function: bit 0x0001 is set when the LUT was allocated by dsps_cplx_gen_init(...)*/
} cplx_sig_t;
/**
* @brief Initialize structure for complex generator
*
* Function initializes a structure for either 16-bit fixed point, or 32-bit floating point complex generator using LUT table.
* cplx_gen_free(...) must be called, once the generator is not needed anymore to free dynamically allocated memory
*
* A user can specify his own LUT table and pass a pointer to the table (void *lut) during the initialization. If the LUT table
* pointer passed to the init function is a NULL, the LUT table is allocated and filled with one period of a sine internally.
*
* @param cplx_gen: pointer to the complex signal generator structure
* @param d_type: output data type - out_d_type enum
* @param lut: pointer to a user-defined LUT, the data type is void so both (S16_FIXED, F32_FLOAT) types could be used
* @param lut_len: length of the LUT, must be a power of 2. If the LUT is allocated internally (lut is NULL),
*                 the length must also be in a range from 256 to 8192
* @param freq: Frequency of the output signal, must be greater than -1 and lower than 1, where 1 is a Nyquist frequency
* @param initial_phase: initial phase of the complex signal, must be greater than -1 and lower than 1,
*                       where 1 is related to 2Pi and -1 is related to -2Pi
*
* @return
* - ESP_OK on success
* - ESP_ERR_DSP_INVALID_LENGTH if lut_len is not a power of 2
* - ESP_ERR_DSP_PARAM_OUTOFRANGE if the length of an internally allocated LUT is out of the 256 ... 8192 range
* - ESP_ERR_DSP_INVALID_PARAM if freq or initial_phase is out of range, or if d_type is unknown while the LUT
*   should be allocated internally
*/
esp_err_t dsps_cplx_gen_init(cplx_sig_t *cplx_gen, out_d_type d_type, void *lut, int32_t lut_len, float freq, float initial_phase);
/**
* @brief function sets the output frequency of the complex generator
*
* set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
*
* @param cplx_gen: pointer to the complex signal generator structure
* @param freq: new frequency to be set, must be greater than -1 and lower than 1, where 1 is a Nyquist frequency
*
* @return
* - ESP_OK on success
* - ESP_ERR_DSP_INVALID_PARAM if the frequency is out of the Nyquist frequency range (the stored frequency is kept)
*/
esp_err_t dsps_cplx_gen_freq_set(cplx_sig_t *cplx_gen, float freq);
/**
* @brief function gets the output frequency of the complex generator
*
* get function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
*
* @param cplx_gen: pointer to the complex signal generator structure
*
* @return
* - frequency of the signal generator
* - -2 if the structure does not appear to be initialized (its LUT length is not a power of 2)
*/
float dsps_cplx_gen_freq_get(cplx_sig_t *cplx_gen);
/**
* @brief function sets the phase of the complex generator
*
* set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
*
* @param cplx_gen: pointer to the complex signal generator structure
* @param phase: new phase to be set, must be greater than -1 and lower than 1, where 1 is related to 2Pi and -1 is related to -2Pi
*
* @return
* - ESP_OK on success
* - ESP_ERR_DSP_INVALID_PARAM if the phase is out of -1 ... 1 range (the stored phase is kept)
*/
esp_err_t dsps_cplx_gen_phase_set(cplx_sig_t *cplx_gen, float phase);
/**
* @brief function gets the phase of the complex generator
*
* get function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
*
* @param cplx_gen: pointer to the complex signal generator structure
*
* @return
* - phase of the signal generator
* - -2 if the structure does not appear to be initialized (its LUT length is not a power of 2)
*/
float dsps_cplx_gen_phase_get(cplx_sig_t *cplx_gen);
/**
* @brief function sets the output frequency and the phase of the complex generator
*
* set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function.
* Neither value is stored unless both of them are in range.
*
* @param cplx_gen: pointer to the complex signal generator structure
* @param freq: new frequency to be set, must be greater than -1 and lower than 1, where 1 is a Nyquist frequency
* @param phase: new phase to be set, must be greater than -1 and lower than 1, where 1 is related to 2Pi and -1 is related to -2Pi
*
* @return
* - ESP_OK on success
* - ESP_ERR_DSP_INVALID_PARAM if the frequency is out of the Nyquist frequency range
* or if the phase is out of -1 ... 1 range
*/
esp_err_t dsps_cplx_gen_set(cplx_sig_t *cplx_gen, float freq, float phase);
/**
* @brief function frees dynamically allocated memory, which was allocated in the init function
*
* free function must be called after the dsps_cplx_gen_init(...) is called, once the complex generator is not
* needed anymore. Only a LUT allocated by the init function is freed; a LUT supplied by the user is left untouched.
*
* @param cplx_gen: pointer to the complex signal generator structure
*/
void cplx_gen_free(cplx_sig_t *cplx_gen);
/**
* @brief The function generates a complex signal
*
* the generated complex signal is in the form of two harmonic signals in either 16-bit signed fixed point
* or 32-bit floating point, read from the LUT and stored interleaved:
*
* output[2*i + 0] = cos(2*Pi*ph[i])
* output[2*i + 1] = sin(2*Pi*ph[i])
* where ph[i] = phase + i*frequency, wrapped to the range 0 ... 1 and quantized to the LUT resolution
*
* The formulas hold for a LUT containing one period of a sine (as generated by dsps_cplx_gen_init(...)).
* The phase stored in the generator structure is only read, it is not updated by the function.
*
* NOTE(review): the phase advances by `frequency` per output sample with 1 corresponding to a full period,
* which does not obviously match "1 is a Nyquist frequency" stated for the frequency parameter - to be confirmed.
*
* dsps_cplx_gen_ansi() - The implementation uses ANSI C and could be compiled and run on any platform
* dsps_cplx_gen_ae32() - Is targeted for Xtensa cores
*
* @param cplx_gen: pointer to the generator structure
* @param output: output array (length of len*2), data type is void so both (S16_FIXED, F32_FLOAT) types could be used
* @param len: length of the output signal (number of complex samples)
*
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_cplx_gen_ansi(cplx_sig_t *cplx_gen, void *output, int32_t len);
esp_err_t dsps_cplx_gen_ae32(cplx_sig_t *cplx_gen, void *output, int32_t len);
#ifdef __cplusplus
}
#endif
// Select the implementation behind dsps_cplx_gen: the Xtensa assembly version when one of the
// platform switches is set, the ANSI C version otherwise (also when the switches are not defined).
#if (dsps_cplx_gen_ae32_enbled || dsps_cplx_gen_aes3_enbled)
#define dsps_cplx_gen dsps_cplx_gen_ae32
#else // no optimized implementation enabled
#define dsps_cplx_gen dsps_cplx_gen_ansi
#endif // (dsps_cplx_gen_ae32_enbled || dsps_cplx_gen_aes3_enbled)
#endif // _dsps_cplx_gen_H_

View File

@@ -0,0 +1,30 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
// Platform switches for the complex generator.
// Exactly one of dsps_cplx_gen_aes3_enbled / dsps_cplx_gen_ae32_enbled is set to 1 on an Xtensa
// core with floating point and zero-overhead loop support, depending on the IDF target.
// On every other configuration both macros stay undefined, which the users of these switches
// evaluate as 0 inside their #if expressions.
#ifndef _dsps_cplx_gen_platform_H_
#define _dsps_cplx_gen_platform_H_
#include "sdkconfig.h"
#ifdef __XTENSA__
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>
// the optimized implementation needs the floating point unit and the hardware loops
#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
#if CONFIG_IDF_TARGET_ESP32S3
#define dsps_cplx_gen_aes3_enbled 1
#define dsps_cplx_gen_ae32_enbled 0
#elif CONFIG_IDF_TARGET_ESP32
#define dsps_cplx_gen_ae32_enbled 1
#define dsps_cplx_gen_aes3_enbled 0
#endif // CONFIG_IDF_TARGET_ESP32S3 CONFIG_IDF_TARGET_ESP32
#endif // ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
#endif // __XTENSA__
#endif // _dsps_cplx_gen_platform_H_

View File

@@ -0,0 +1,47 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dsps_d_gen_H_
#define _dsps_d_gen_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief delta function
*
* The function generates a delta function.
* output[i]=0, if i=[0..N)
* output[i]=1, if i=pos, pos: [0..N-1)
* The implementation uses ANSI C and could be compiled and run on any platform
*
* @param output: output array.
* @param len: length of the output signal
* @param pos: delta function position
*
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_d_gen_f32(float *output, int len, int pos);
#ifdef __cplusplus
}
#endif
#endif // _dsps_d_gen_H_

View File

@@ -0,0 +1,48 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dsps_h_gen_H_
#define _dsps_h_gen_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief Heaviside function
*
* The Heaviside (unit step) function.
* output[i]=0, if i=[0..pos)
* output[i]=1, if i=[pos..N)
* The implementation uses ANSI C and could be compiled and run on any platform
*
* @param output: output array.
* @param len: length of the output signal
* @param pos: Heaviside function position
*
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_h_gen_f32(float *output, int len, int pos);
#ifdef __cplusplus
}
#endif
#endif // _dsps_h_gen_H_

View File

@@ -0,0 +1,51 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dsps_sfdr_H_
#define _dsps_sfdr_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief SFDR
*
* The function calculates Spurious-Free Dynamic Range.
* The function makes an FFT of the input, then searches for the spectrum maximum, and then compares
* the maximum value with all others. The result is calculated as the minimum value.
* This function has to be used for debug and unit tests only. It's not optimized for real-time processing.
* The implementation uses ANSI C and could be compiled and run on any platform
*
* @param[in] input: input array.
* @param len: length of the input signal
* @param use_dc: this parameter defines whether the DC value will be used for the calculation or not.
* 0 - the calculation will not include DC power
* 1 - the calculation will include DC power
*
* @return
* - SFDR in dB
*/
float dsps_sfdr_f32(const float *input, int32_t len, int8_t use_dc);
float dsps_sfdr_fc32(const float *input, int32_t len);
#ifdef __cplusplus
}
#endif
#endif // _dsps_sfdr_H_

View File

@@ -0,0 +1,51 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _DSP_SNR_H_
#define _DSP_SNR_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief SNR
*
* The function calculates the signal to noise ratio in case the signal is a sine tone.
* The function makes an FFT of the input, then searches for the spectrum maximum, and then calculates
* SNR as sum of all harmonics to the maximum value.
* This function has to be used for debug and unit tests only. It's not optimized for real-time processing.
* The implementation uses ANSI C and could be compiled and run on any platform
*
* @param input: input array.
* @param len: length of the input signal
* @param use_dc: this parameter defines whether the DC value will be used for the calculation or not.
* 0 - SNR will not include DC power
* 1 - SNR will include DC power
*
* @return
* - SNR in dB
*/
float dsps_snr_f32(const float *input, int32_t len, uint8_t use_dc);
float dsps_snr_fc32(const float *input, int32_t len);
#ifdef __cplusplus
}
#endif
#endif // _DSP_SNR_H_

View File

@@ -0,0 +1,48 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dsps_tone_gen_H_
#define _dsps_tone_gen_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @brief tone
*
* The function generates a tone signal.
* x[i]=A*sin(2*PI*i + ph/180*PI)
* NOTE(review): the formula above does not contain the freq parameter - to be checked against the implementation.
* The implementation uses ANSI C and could be compiled and run on any platform
*
* @param output: output array.
* @param len: length of the output signal
* @param Ampl: amplitude
* @param freq: Nyquist frequency -1..1
* @param phase: phase in degrees
*
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_tone_gen_f32(float *output, int len, float Ampl, float freq, float phase);
#ifdef __cplusplus
}
#endif
#endif // _dsps_tone_gen_H_

View File

@@ -0,0 +1,64 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dsps_view_H_
#define _dsps_view_H_
#include "dsp_err.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**@{*/
/**
* @brief plot view
*
* Generic view function.
* This function takes input samples and shows them in the console as a plot.
* The main purpose is to give draft debug information to the DSP developer.
* dsps_view takes float samples, dsps_view_s16 takes int16_t samples.
*
* @param[in] data: array with input samples.
* @param len: length of the input array
* @param width: plot width in symbols
* @param height: plot height in lines
* @param min: minimum value that will be limited by Axis Y.
* @param max: maximum value that will be limited by Axis Y.
* @param view_char: character to draw the plot values ('.' or '|' etc)
*
*/
void dsps_view(const float *data, int32_t len, int width, int height, float min, float max, char view_char);
void dsps_view_s16(const int16_t *data, int32_t len, int width, int height, float min, float max, char view_char);
/**@}*/
/**
* @brief spectrum view
*
* The view function to show spectrum values on a 64x10 screen.
* The function is based on dsps_view.
*
* @param[in] data: array with input samples.
* @param len: length of the input array
* @param min: minimum value that will be limited by Axis Y.
* @param max: maximum value that will be limited by Axis Y.
*
*/
void dsps_view_spectrum(const float *data, int32_t len, float min, float max);
#ifdef __cplusplus
}
#endif
#endif // _dsps_view_H_

View File

@@ -0,0 +1,340 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_mem_platform.h"
#if dsps_mem_aes3_enbled
// This is memory access for ESP32S3 processor.
.text
.align 4
.global dsps_memcpy_aes3
.type dsps_memcpy_aes3,@function
// The function implements the following C code:
// void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
// Input params Variables
//
// arr_dest - a2 loop_len - a5, a6
// arr_src - a3 p_arr_des - a7
// arr_len - a4 div_48 - a8
// align_mask - a9
/*
esp32s3 optimized memcpy function works with both, aligned and unaligned data.
arr_dest aligned --> - _main_loop_aligned, 32 bytes in one run through the cycle, only aligned data
arr_src aligned / - Check modulos to finish copying the remaining data outside of the cycle
- Modulo 8 and 16 - S3 instructions for aligned data, the rest of the modulos are generic
arr_dest aligned ---> - _main_loop_unaligned, 48 bytes of source unaligned data in one run through the cycle,
arr_src unaligned / (the destination must always be aligned)
- Check modulos to finish copying remaining data outside of the cycle
- Modulo 32 and 16 - S3 instructions for unaligned data, the rest of the modulos are generic
arr_dest unaligned -> - First, use generic instructions to align the arr_dest data (keep increasing
arr_src aligned / the arr_dest pointer until the pointer is aligned)
- Once arr_dest is aligned treat the rest of the data as:
either both aligned (if arr_src happens to be aligned after the arr_dest aligning),
or as arr_dest aligned and arr_src unaligned
- Continue as mentioned above
arr_dest unaligned -> - Very same approach as with arr_dest unaligned and arr_src aligned
arr_src unaligned /
if the arr_len is less than 16, jump to _less_than_16 label and copy data without any s3 instructions or cycles
*/
#define MEMCPY_OPTIMIZED 1 // 1 - build the S3-optimized memcpy, 0 - build the ANSI-style memcpy (testing purposes)
#define TIE_ENABLE 0 // 1 - put a dummy TIE instruction into the ANSI memcpy to induce TIE context saving
// dsps_memcpy_aes3 - S3-optimized branch
//
// On entry: a2 - arr_dest, a3 - arr_src, a4 - arr_len (bytes)
// On exit:  a2 - the original arr_dest pointer (saved in a7 at the start and restored before every retw.n)
//
// Flow:
//  - arr_len < 16: copy with generic 8/4/2/1-byte steps only (_less_than_16)
//  - otherwise arr_dest is first advanced to a 16-byte boundary with generic 8/4/2/1-byte steps
//    and arr_len (a4) is reduced by the number of bytes copied that way
//  - arr_src 16-byte aligned at that point:  32 bytes per loop iteration (_arr_src_aligned)
//  - arr_src not aligned at that point:      48 bytes per loop iteration using the ee.src.q shift instructions
//  - the remainder (a6) is finished bit by bit (32/16/8/4/2/1 bytes)
dsps_memcpy_aes3:
#if MEMCPY_OPTIMIZED
// S3 optimized version of the memcpy (with TIE instructions)
entry a1, 32
mov a7, a2 // a7 - save arr_dest pointer
blti a4, 16, _less_than_16
// arr_dest alignment check
movi.n a9, 0xf // 0xf alignment mask
and a13, a9, a2 // 0xf AND arr_dest pointer
beqz a13, _arr_dest_aligned
movi.n a14, 16 // a14 - 16
sub a13, a14, a13 // a13 = 16 - unalignment
sub a4, a4, a13 // len = len - (16 - unalignment)
// Aligning the arr_dest
// keep copying until arr_dest is aligned
// If bit 3 of the unalignment is set, copy 8 bytes
bbci a13, 3, _aligning_mod_8_check // branch if 3-rd bit of unalignment a13 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_aligning_mod_8_check:
// If bit 2 of the unalignment is set, copy 4 bytes
bbci a13, 2, _aligning_mod_4_check // branch if 2-nd bit of unalignment a13 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_aligning_mod_4_check:
// If bit 1 of the unalignment is set, copy 2 bytes
bbci a13, 1, _aligning_mod_2_check // branch if 1-st bit of unalignment a13 is clear
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_aligning_mod_2_check:
// If bit 0 of the unalignment is set, copy 1 byte
bbci a13, 0, _arr_dest_aligned // branch if 0-th bit of unalignment a13 is clear
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
addi.n a3, a3, 1 // increment arr_src pointer by 1 byte
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
addi.n a2, a2, 1 // increment arr_dest pointer by 1 byte
_arr_dest_aligned:
// arr_src alignment check
and a15, a9, a3 // 0xf (alignment mask) AND arr_src pointer
beqz a15, _arr_src_aligned
// arr_src unaligned, arr_dest aligned (arr_dest either aligned originally or made aligned by the aligning routine above)
// Calculate modulo for non-aligned data
movi a8, 89478486 // a8 - div_48 constant, 89478486 = ceil(2^32 / 48)
muluh a5, a8, a4 // a5 - loop_len = arr_len / 48 (upper 32 bits of arr_len * div_48)
movi a9, 48 // a9 - 48 (the alignment mask in a9 is no longer needed)
mul16s a8, a9, a5 // a8 - 48 * loop_len; mul16s multiplies only the low 16 bits of its operands,
// which is sufficient here because only bits 5..0 of a6 are tested below
sub a6, a4, a8 // a6 - loop_len_MOD 48 (valid in bits 5..0)
ee.ld.128.usar.ip q2, a3, 16 // Preload from arr_src
ee.ld.128.usar.ip q3, a3, 16 // Preload from arr_src
// Main loop arr_src unaligned
loopnez a5, ._main_loop_unaligned // 48 bytes in one loop
ee.src.q.ld.ip q4, a3, 16, q2, q3 // preload and shift from arr_src
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
ee.src.q.ld.ip q2, a3, 16, q3, q4 // preload and shift from arr_src
ee.vst.128.ip q3, a2, 16 // store to aligned arr_dest
ee.src.q.ld.ip q3, a3, 16, q4, q2 // preload and shift from arr_src
ee.vst.128.ip q4, a2, 16 // store to aligned arr_dest
._main_loop_unaligned:
// Finish the _main_loop_unaligned outside of the loop from Q registers preloads
// If bit 5 of the loop_len_MOD is set, copy 32 bytes
bbci a6, 5, _unaligned_mod_32_check // branch if 5-th bit of loop_len_MOD a6 is clear
ee.src.q.ld.ip q4, a3, 0, q2, q3 // preload and shift from arr_src (pointer increment 0)
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
ee.src.q q3, q3, q4 // final shift
ee.vst.128.ip q3, a2, 16 // store to aligned arr_dest
j _follow_unaligned
_unaligned_mod_32_check:
// If bit 4 of the loop_len_MOD is set, copy 16 bytes
bbci a6, 4, _unaligned_mod_16_check // branch if 4-th bit of loop_len_MOD a6 is clear
ee.src.q q2, q2, q3 // final shift
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
addi a3, a3, -16 // put arr_src pointer back (only 16 of the 32 preloaded bytes were consumed)
j _follow_unaligned
_unaligned_mod_16_check:
addi a3, a3, -32 // put arr_src pointer back (none of the 32 preloaded bytes were consumed)
// Finish the _main_loop_unaligned outside of the loop
// If bit 3 of the loop_len_MOD is set, copy 8 bytes
_follow_unaligned:
bbci a6, 3, _unaligned_mod_8_check // branch if 3-rd bit of loop_len_MOD a6 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_unaligned_mod_8_check:
// Finish the rest of the data (4/2/1 bytes) with the generic code shared with the aligned path, no S3 instructions are used after the jump
j _aligned_mod_8_check
// Both arrays (arr_src and arr_dest) aligned
_arr_src_aligned:
// Calculate modulo 32 for aligned data
srli a5, a4, 5 // a5 - loop_len = arr_len / 32
slli a6, a5, 5
sub a6, a4, a6 // a6 - loop_len_MOD 32
// Main loop arr_src aligned
loopnez a5, ._main_loop_aligned // 32 bytes in one loop
ee.vld.128.ip q0, a3, 16 // load 16 bytes from arr_src to q0
ee.vld.128.ip q1, a3, 16 // load 16 bytes from arr_src to q1
ee.vst.128.ip q0, a2, 16 // save 16 bytes to arr_dest from q0
ee.vst.128.ip q1, a2, 16 // save 16 bytes to arr_dest from q1
._main_loop_aligned:
// Modulo 32 check, nothing left to copy when the remainder is zero
beqz a6, _aligned_mod_32_check // branch if mod_32 = 0
// finish the end of the array outside of the main loop
// If bit 4 of the loop_len_MOD is set, copy 16 bytes
bbci a6, 4, _aligned_mod_16_check // branch if 4-th bit of loop_len_MOD a6 is clear
ee.vld.128.ip q0, a3, 16 // load 128 bits from arr_src to q0, increase arr_src pointer by 16 bytes
ee.vst.128.ip q0, a2, 16 // save 128 bits to arr_dest from q0, increase arr_dest pointer by 16 bytes
_aligned_mod_16_check:
// If bit 3 of the loop_len_MOD is set, copy 8 bytes
bbci a6, 3, _aligned_mod_8_check // branch if 3-rd bit of loop_len_MOD a6 is clear
ee.vld.l.64.ip q0, a3, 8 // load lower 64 bits from arr_src a3 to q0, increase arr_src pointer by 8 bytes
ee.vst.l.64.ip q0, a2, 8 // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
_aligned_mod_8_check:
// If bit 2 of the loop_len_MOD is set, copy 4 bytes (also the entry point for the tail of the unaligned path)
bbci a6, 2, _aligned_mod_4_check // branch if 2-nd bit of loop_len_MOD a6 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_aligned_mod_4_check:
// If bit 1 of the loop_len_MOD is set, copy 2 bytes
bbci a6, 1, _aligned_mod_2_check // branch if 1-st bit of loop_len_MOD a6 is clear
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_aligned_mod_2_check:
// If bit 0 of the loop_len_MOD is set, copy 1 byte
bbci a6, 0, _aligned_mod_32_check // branch if 0-th bit of loop_len_MOD a6 is clear
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
_aligned_mod_32_check:
// Common exit of the optimized paths
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
retw.n // return
_less_than_16:
// If the length of the copied array is lower than 16, it is faster not to use esp32s3-optimized functions
// If bit 3 of the arr_len is set, copy 8 bytes
bbci a4, 3, _less_than_16_mod_8_check // branch if 3-rd bit of arr_len a4 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_less_than_16_mod_8_check:
// If bit 2 of the arr_len is set, copy 4 bytes
bbci a4, 2, _less_than_16_mod_4_check // branch if 2-nd bit of arr_len a4 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_less_than_16_mod_4_check:
// If bit 1 of the arr_len is set, copy 2 bytes
bbci a4, 1, _less_than_16_mod_2_check // branch if 1-st bit of arr_len a4 is clear
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_less_than_16_mod_2_check:
// If bit 0 of the arr_len is set, copy 1 byte
bbci a4, 0, _less_than_16_mod_1_check // branch if 0-th bit of arr_len a4 is clear
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
_less_than_16_mod_1_check:
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
retw.n // return
#else // MEMCPY_OPTIMIZED
// ansi version of the memcpy (without TIE instructions) for testing purposes
// On entry: a2 - arr_dest, a3 - arr_src, a4 - arr_len; on exit a2 holds the original arr_dest (saved in a7)
// Copies 16 bytes per loop iteration, then finishes the remaining 8/4/2/1 bytes according to bits 3..0 of arr_len
entry a1, 32
mov a7, a2 // a7 - save arr_dest pointer
srli a5, a4, 4 // a5 - loop_len = arr_len / 16
// Run main loop which copies 16 bytes in one loop run
loopnez a5, ._ansi_loop
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
l32i.n a13, a3, 8 // load 32 bits from arr_src a3 to a13, offset 8
l32i.n a12, a3, 12 // load 32 bits from arr_src a3 to a12, offset 12
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
s32i.n a13, a2, 8 // save 32 bits from a13 to arr_dest a2, offset 8
s32i.n a12, a2, 12 // save 32 bits from a12 to arr_dest a2, offset 12
addi.n a3, a3, 16 // increment arr_src pointer by 16 bytes
addi.n a2, a2, 16 // increment arr_dest pointer by 16 bytes
._ansi_loop:
// Finish the remaining bytes out of the loop
// If bit 3 of the arr_len is set, copy 8 bytes
bbci a4, 3, _mod_8_check // branch if 3-rd bit of arr_len a4 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_mod_8_check:
// If bit 2 of the arr_len is set, copy 4 bytes
bbci a4, 2, _mod_4_check // branch if 2-nd bit of arr_len a4 is clear
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_mod_4_check:
// If bit 1 of the arr_len is set, copy 2 bytes
bbci a4, 1, _mod_2_check // branch if 1-st bit of arr_len a4 is clear
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_mod_2_check:
// If bit 0 of the arr_len is set, copy 1 byte
bbci a4, 0, _mod_1_check // branch if 0-th bit of arr_len a4 is clear
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
_mod_1_check:
// if arr_len is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
blti a4, 16, _less_than_16_1 // branch, if arr_len a4 is shorter than 16 bytes
#if TIE_ENABLE // put dummy TIE instruction to induce TIE context saving
ee.zero.qacc // dummy TIE instruction (NOTE(review): presumably clears the QACC accumulator - confirm)
#else // TIE_ENABLE
nop // compensate one cycle, when TIE is disabled to get the same benchmark value
#endif // TIE_ENABLE
_less_than_16_1:
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
retw.n // return
#endif // MEMCPY_OPTIMIZED
#endif // dsps_mem_aes3_enbled

View File

@@ -0,0 +1,248 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dsps_mem_platform.h"
#if dsps_mem_aes3_enbled
// This is memory access for ESP32S3 processor.
.text
.align 4
.global dsps_memset_aes3
.type dsps_memset_aes3,@function
// The function implements the following C code:
// void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
// Input params Variables
//
// arr_dest - a2 loop_len - a5
// set_val - a3 p_arr_dest - a8
// set_size - a4 8_bit_set - a7
// 16_bit_set - a9
// 32_bit_set - a10
// align_mask - a11
/*
esp32s3 optimized memset function works with both, aligned and unaligned data.
arr_dest aligned - _main_loop, 16 bytes in one loop, only aligned data
- Check modulos to finish copying remaining data outside of the cycle
- Modulo 8 - S3 instruction for aligned data, the rest of the modulos are generic
arr_dest unaligned - First, use generic instructions to align the arr_dest data (keep increasing
the arr_dest pointer until the pointer is aligned)
- Once arr_dest is aligned treat the rest of the data as aligned, same as above
if the set_size is less than 16, jump to _less_than_16 label and set data without any s3 instructions or cycles
*/
#define MEMSET_OPTIMIZED 1 // 1 - build the S3-optimized memset, 0 - build the ANSI-style memset (testing purposes)
#define TIE_ENABLE 0 // 1 - put a dummy TIE instruction into the ANSI memset to induce TIE context saving
// dsps_memset_aes3 - S3-optimized branch
//
// On entry: a2 - arr_dest, a3 - set_val, a4 - set_size (bytes)
// On exit:  a2 - the original arr_dest pointer (saved in a8 at the start and restored before every retw.n)
//
// Flow:
//  - set_size < 16: set with generic 16-bit / 8-bit stores only (_less_than_16)
//  - otherwise set_val is masked to 8 bits (a7) and broadcast to 16 bits (a9), 32 bits (a10) and 128 bits (q0)
//  - arr_dest is advanced to a 16-byte boundary with generic 8/4/2/1-byte steps, set_size (a4) is reduced accordingly
//  - main loop stores 16 bytes per iteration from q0, the remainder is finished according to bits 3..0 of a4
dsps_memset_aes3:
#if MEMSET_OPTIMIZED
entry a1, 32
mov a8, a2 // a8 - save arr_dest pointer
blti a4, 16, _less_than_16 // set_size shorter than 16
movi.n a7, 0xff // 0xff one-byte mask
movi.n a11, 0xf // 0xf alignment mask
and a7, a7, a3 // mask upper 24 bits of set_val a3
bnez.n a7, _non_zero_constant
// set_val is zero: all set patterns are zero
ee.zero.q q0 // initialize q0 to zero
movi.n a9, 0 // initialize (16_bit_set) a9 to zero
movi.n a10, 0 // initialize (32_bit_set) a10 to zero
j _q_reg_prepared
_non_zero_constant:
// Fill q register
slli a6, a7, 8 // a6 - (masked)set_val << 8
or a9, a6, a7 // a9 - (masked)set_val << 8 + (masked)set_val
// a9 - 16-bit set
slli a15, a9, 16 // a15 - a9 << 16
or a10, a9, a15 // broadcast 8 bits from set_val a3 to 32 bits
// a10 - 32-bit set
ee.movi.32.q q0, a10, 0 // fill q0 register from a10 by 32 bits
ee.movi.32.q q0, a10, 1
ee.movi.32.q q0, a10, 2
ee.movi.32.q q0, a10, 3
_q_reg_prepared:
// alignment check
and a15, a11, a2 // 0xf (alignment mask) AND arr_dest pointer
beqz a15, _arr_dest_aligned // branch if a15 equals to zero
movi.n a14, 16 // a14 - 16
sub a15, a14, a15 // a15 = 16 - unalignment
sub a4, a4, a15 // len = len - (16 - unalignment)
// keep setting until arr_dest is aligned
// If bit 3 of the unalignment is set, set 8 bytes
bbci a15, 3, _aligning_mod_8_check // branch if 3-rd bit of unalignment a15 is clear
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
s32i.n a10, a2, 4 // save 32 bits from a10 to arr_dest a2, offset 4 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_aligning_mod_8_check:
// If bit 2 of the unalignment is set, set 4 bytes
bbci a15, 2, _aligning_mod_4_check // branch if 2-nd bit unalignment a15 is clear
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_aligning_mod_4_check:
// If bit 1 of the unalignment is set, set 2 bytes
bbci a15, 1, _aligning_mod_2_check // branch if 1-st bit unalignment a15 is clear
s16i a9, a2, 0 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_aligning_mod_2_check:
// If bit 0 of the unalignment is set, set 1 byte
bbci a15, 0, _arr_dest_aligned // branch if 0-th bit unalignment a15 is clear
s8i a7, a2, 0 // save 8 bits from a7 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 1 // increment arr_dest pointer by 1 byte
_arr_dest_aligned:
// Calculate main loop_len
srli a5, a4, 4 // a5 - loop_len = set_size / 16
// Main loop
loopnez a5, ._main_loop // 16 bytes in one loop
ee.vst.128.ip q0, a2, 16 // store 16 bytes from q0 to arr_dest a2
._main_loop:
// If bit 3 of the set_size is set, set 8 bytes
bbci a4, 3, _aligned_mod_8_check // branch if 3-rd bit of set_size a4 is clear
ee.vst.l.64.ip q0, a2, 8 // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
_aligned_mod_8_check:
// If bit 2 of the set_size is set, set 4 bytes
bbci a4, 2, _aligned_mod_4_check // branch if 2-nd bit of set_size a4 is clear
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_aligned_mod_4_check:
// If bit 1 of the set_size is set, set 2 bytes
bbci a4, 1, _aligned_mod_2_check // branch if 1-st bit of set_size a4 is clear
s16i a9, a2, 0 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_aligned_mod_2_check:
// If bit 0 of the set_size is set, set 1 byte
bbci a4, 0, _aligned_mod_1_check // branch if 0-th bit of set_size a4 is clear
s8i a7, a2, 0 // save 8 bits from a7 to arr_dest a2, offset 0 bytes
_aligned_mod_1_check:
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
retw.n // return
_less_than_16:
// make 16-bit set_val
// NOTE(review): set_val (a3) is used here without the 0xff mask applied on the main path; the 16-bit
// pattern is only correct when bits 31..8 of a3 are zero - presumably guaranteed by the caller passing a uint8_t, confirm
slli a6, a3, 8 // a6 - a3 (set_val) << 8
or a7, a6, a3 // a7 - a3 (set_val) << 8 + a3 (set_val)
// If bit 3 of the set_size is set, set 8 bytes
bbci a4, 3, _less_than_16_mod_8_check // branch if 3-rd bit of set_size a4 is clear
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
s16i a7, a2, 2 // save 16 bits from a7 to arr_dest a2, offset 2 bytes
s16i a7, a2, 4 // save 16 bits from a7 to arr_dest a2, offset 4 bytes
s16i a7, a2, 6 // save 16 bits from a7 to arr_dest a2, offset 6 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_less_than_16_mod_8_check:
// If bit 2 of the set_size is set, set 4 bytes
bbci a4, 2, _less_than_16_mod_4_check // branch if 2-nd bit of set_size a4 is clear
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
s16i a7, a2, 2 // save 16 bits from a7 to arr_dest a2, offset 2 bytes
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_less_than_16_mod_4_check:
// If bit 1 of the set_size is set, set 2 bytes
bbci a4, 1, _less_than_16_mod_2_check // branch if 1-st bit of set_size a4 is clear
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_less_than_16_mod_2_check:
// If bit 0 of the set_size is set, set 1 byte
bbci a4, 0, _less_than_16_mod_1_check // branch if 0-th bit of set_size a4 is clear
s8i a3, a2, 0 // save 8 bits from a3 to arr_dest a2, offset 0 bytes
_less_than_16_mod_1_check:
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
retw.n // return
#else // MEMSET_OPTIMIZED
// ansi version of the memset (without TIE instructions) for testing purposes
// On entry: a2 - arr_dest, a3 - set_val, a4 - set_size; on exit a2 holds the original arr_dest (saved in a8)
// Sets 16 bytes per loop iteration, then finishes the remaining 8/4/2/1 bytes according to bits 3..0 of set_size
entry a1, 32
mov a8, a2 // a8 - save arr_dest pointer
movi.n a7, 0xff // 0xff one-byte mask
and a7, a7, a3 // mask upper 24 bits of a3
slli a6, a7, 8 // a6 - (masked)set_val << 8
or a9, a6, a7 // a9 - (masked)set_val << 8 + (masked)set_val
// a9 - 16-bit set
slli a15, a9, 16 // a15 - a9 << 16
or a10, a9, a15 // broadcast 8 bits from a3 to 32 bits (a10 - 32-bit set)
srli a5, a4, 4 // a5 - loop_len = arr_len / 16
// Run main loop which sets 16 bytes in one loop run
loopnez a5, ._ansi_loop
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
s32i.n a10, a2, 4 // save 32 bits from a10 to arr_dest a2, offset 4 bytes
s32i.n a10, a2, 8 // save 32 bits from a10 to arr_dest a2, offset 8 bytes
s32i.n a10, a2, 12 // save 32 bits from a10 to arr_dest a2, offset 12 bytes
addi.n a2, a2, 16 // increment arr_dest pointer by 16 bytes
._ansi_loop:
// Finish the remaining bytes out of the loop
// If bit 3 of the arr_len is set, set 8 bytes
bbci a4, 3, _mod_8_check // branch if 3-rd bit of arr_len is clear
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
s32i.n a10, a2, 4 // save 32 bits from a10 to arr_dest a2, offset 4 bytes
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
_mod_8_check:
// If bit 2 of the arr_len is set, set 4 bytes
bbci a4, 2, _mod_4_check // branch if 2-nd bit of arr_len is clear
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
_mod_4_check:
// If bit 1 of the arr_len is set, set 2 bytes
bbci a4, 1, _mod_2_check // branch if 1-st bit of arr_len is clear
s16i a9, a2, 0 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
_mod_2_check:
// If bit 0 of the arr_len is set, set 1 byte
bbci a4, 0, _mod_1_check // branch if 0-th bit of arr_len is clear
s8i a7, a2, 0 // save 8 bits from a7 to arr_dest a2, offset 0 bytes
_mod_1_check:
// if arr_len is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
blti a4, 16, _less_than_16_1 // set_size shorter than 16, to fix panic handler before main_app() load
#if TIE_ENABLE // put dummy TIE instruction to induce TIE context saving
ee.zero.qacc // dummy TIE instruction (NOTE(review): presumably clears the QACC accumulator rather than q0 - confirm)
#else // TIE_ENABLE
nop // compensate one cycle, when TIE is disabled to get the same benchmark value
#endif // TIE_ENABLE
_less_than_16_1:
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
retw.n // return
#endif // MEMSET_OPTIMIZED
#endif // dsps_mem_aes3_enbled

View File

@@ -0,0 +1,67 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef _dsps_mem_H_
#define _dsps_mem_H_
#include "dsp_err.h"
#include "dsp_common.h"
#include "dsps_mem_platform.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**@{*/
/**
* @brief memory copy function using esp32s3 TIE
*
* The extension (_aes3) is optimized for the esp32S3 chip.
*
* @param arr_dest: pointer to the destination array
* @param arr_src: pointer to the source array
* @param arr_len: count of bytes to be copied from arr_src to arr_dest
*
* @return: pointer to dest array
*/
void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
/**@}*/
/**@{*/
/**
* @brief memory set function using esp32s3 TIE
*
* The extension (_aes3) is optimized for the esp32S3 chip.
*
* @param arr_dest: pointer to the destination array
* @param set_val: byte value the dest array will be set to
* @param set_size: count of bytes of the dest array to be set
*
* @return: pointer to dest array
*/
void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
/**@}*/
#ifdef __cplusplus
}
#endif
// Select the implementation behind the generic dsps_memcpy / dsps_memset names:
// the ESP32-S3 TIE versions are used only when the optimized build is requested AND
// the platform header enabled them; every other configuration maps to the C library functions.
// NOTE(review): the fallback relies on memcpy/memset being declared (<string.h>); this header
// does not include it directly - presumably dsp_common.h provides it, confirm.
#if CONFIG_DSP_OPTIMIZED && dsps_mem_aes3_enbled
#define dsps_memcpy dsps_memcpy_aes3
#define dsps_memset dsps_memset_aes3
#else // generic build or no ESP32-S3 TIE support
#define dsps_memcpy memcpy
#define dsps_memset memset
#endif // CONFIG_DSP_OPTIMIZED && dsps_mem_aes3_enbled
#endif // _dsps_mem_H_

View File

@@ -0,0 +1,21 @@
// Platform selection for the dsps_mem functions.
// dsps_mem_aes3_enbled is defined only for Xtensa builds where XCHAL_HAVE_FP and XCHAL_HAVE_LOOPS
// both equal 1: as 1 when CONFIG_IDF_TARGET_ESP32S3 is set, as 0 otherwise.
// In every other configuration the macro stays undefined, so it has to be tested with #if
// (where an undefined macro evaluates to 0), not with #ifdef.
#ifndef _dsps_mem_platform_H_
#define _dsps_mem_platform_H_
#include "sdkconfig.h"
#ifdef __XTENSA__
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>
#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
#if CONFIG_IDF_TARGET_ESP32S3
#define dsps_mem_aes3_enbled 1
#else
#define dsps_mem_aes3_enbled 0
#endif // CONFIG_IDF_TARGET_ESP32S3
#endif // (XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1)
#endif // __XTENSA__
#endif // _dsps_mem_platform_H_

View File

@@ -0,0 +1,728 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <malloc.h>
#include <stdbool.h>
#include <string.h>
#include <inttypes.h>
#include "unity.h"
#include "esp_log.h"
#include "esp_err.h"
#include "esp_dsp.h"
#include "dsps_mem.h"
#include "dsp_tests.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "freertos/semphr.h"
#include "freertos/queue.h"
#include "freertos/timers.h"
#include "esp_task_wdt.h"
// Number of consecutive byte counts exercised at each end of the tested range in the functionality test
// (counts 1..N and (full_count - N + 1)..full_count)
#define CORNERS_CPY_SET_COUNT 200
// NOTE(review): the constants below are not referenced by the functionality test; presumably they
// parametrize the benchmark and pinned-task tests further down in this file - confirm
#define MEMCPY_REPORT_LEN 100
#define MEMSET_REPORT_LEN 50
#define CALL_REPEAT_COUNT 1000
#define TEST_PINNED_NUM_TASKS 2
#define TEST_PINNED_NUM_ITERS 2
#define CPY_REPEAT_COUNT 500
#define CPY_ITERS 40
#define AREA_LENGTH 1024
static const char *TAG = "dsps_mem_access"; // presumably the tag passed to the ESP_LOG* macros - confirm
/*
Test functionality of the memcpy and memset functions optimized for esp32s3
Requires: esp32s3
Purpose:
- Test that esp32s3 optimized memcpy and memset have the same functionality as the original memcpy and memset
Procedure:
- Create 4 arrays, 2 source arrays (aligned and unaligned) and 2 destination arrays (aligned and unaligned)
- Initialize the destination arrays to 0, fill the source arrays with non-zero values
- Copy the desired length of content from the source array to the destination array using memcpy
- Compare the content of the destination array with the content of the source array
- Initialize the destination arrays to 0
- Repeat the 3 above steps for different copy lengths (especially corner conditions like copy 0, 1, 2... and N, N -1, N - 2.... bytes)
and following arrays alignments
- destination array 16-byte aligned, source array 16-byte aligned
- destination array unaligned, source array 16-byte aligned
- destination array 16-byte aligned, source array unaligned
- destination array unaligned, source array unaligned
- Set the desired length of the destination array using memset
- Compare the content of the destination array with the set constant
- Initialize the destination arrays to 0
- Repeat the 3 above steps for different set lengths (especially corner conditions like copy 0, 1, 2... and N, N -1, N - 2.... bytes)
and both alignments of the destination array (16-byte aligned or unaligned)
- Free the dynamic array
*/
TEST_CASE("dsps_memcpy_memset_aes3_functionality", "[dsps]")
{
    // Functional check of dsps_memcpy / dsps_memset against the expected buffer content for
    // byte counts 1..CORNERS_CPY_SET_COUNT and (full_count - CORNERS_CPY_SET_COUNT + 1)..full_count,
    // for every combination of memalign()-ed and malloc()-ed source/destination buffers.
    // Canary bytes placed behind the destination buffers detect writes past arr_len.
    const size_t arr_len = 1024;
    const uint8_t set_val = 0xaa;
    const size_t full_count = arr_len;
    const size_t canary_bytes = 16;                 // canary bytes to check a possible overflow
    const unsigned int align_combinations_cpy = 4;  // source and destination arrays aligned or unaligned combinations
    const unsigned int align_combinations_set = 2;  // destination array aligned or unaligned

    uint8_t *arr_dest_align = memalign(16, arr_len + canary_bytes);
    uint8_t *arr_src_align = memalign(16, arr_len);
    // NOTE(review): malloc() does not guarantee that these buffers are NOT 16-byte aligned,
    // so the "unaligned" cases depend on what the allocator happens to return
    uint8_t *arr_dest_unalign = malloc(arr_len + canary_bytes);
    uint8_t *arr_src_unalign = malloc(arr_len);
    uint8_t *arr_dest = NULL;
    uint8_t *arr_src = NULL;

    // Fail the test cleanly instead of dereferencing a NULL pointer when the heap is exhausted
    if (arr_dest_align == NULL || arr_src_align == NULL || arr_dest_unalign == NULL || arr_src_unalign == NULL) {
        free(arr_dest_align);
        free(arr_src_align);
        free(arr_dest_unalign);
        free(arr_src_unalign);
        TEST_FAIL_MESSAGE("Failed to allocate the test arrays");
        return;
    }

    // source pattern: low byte of the index
    for (size_t i = 0; i < arr_len; i++) {
        arr_src_align[i] = (uint8_t)i;
        arr_src_unalign[i] = (uint8_t)i;
    }

    // canary bytes
    memset(&arr_dest_align[arr_len], 0, canary_bytes);
    memset(&arr_dest_unalign[arr_len], 0, canary_bytes);

    // aes3 memcpy functionality
    for (unsigned int align = 0; align < align_combinations_cpy; align++) {     // aligned and unaligned arrays test loop
        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};          // amount of bytes to be copied

        switch (align) {
        case 0:             // both 16-byte aligned
            arr_src = arr_src_align;
            arr_dest = arr_dest_align;
            break;
        case 1:             // destination unaligned, source aligned
            arr_src = arr_src_align;
            arr_dest = arr_dest_unalign;
            break;
        case 2:             // source unaligned, destination aligned
            arr_src = arr_src_unalign;
            arr_dest = arr_dest_align;
            break;
        case 3:             // both unaligned
            arr_src = arr_src_unalign;
            arr_dest = arr_dest_unalign;
            break;
        default:            // default - both aligned
            arr_src = arr_src_align;
            arr_dest = arr_dest_align;
            break;
        }

        for (int var = 0; var < 2; var++) {                          // test corner conditions
            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {        // memcpy from 1 to CORNERS_CPY_SET_COUNT
                                                                     // and from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
                const size_t count = ++byte_count[var];

                memset(arr_dest, 0, full_count);                     // destination array initializing
                dsps_memcpy(arr_dest, arr_src, count);

                TEST_ASSERT_EQUAL_UINT8_ARRAY(arr_src, arr_dest, count);
                if (count < arr_len) {                               // the rest of the destination must stay untouched
                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[count], (arr_len - count));
                }
                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
            }
        }
    }

    // aes3 memset functionality
    for (unsigned int align = 0; align < align_combinations_set; align++) {     // aligned and unaligned arrays test loop
        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};          // amount of bytes to be set

        arr_dest = (align == 0) ? arr_dest_align : arr_dest_unalign;

        for (int var = 0; var < 2; var++) {                          // test corner conditions
            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {        // memset from 1 to CORNERS_CPY_SET_COUNT
                                                                     // and from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
                const size_t count = ++byte_count[var];

                memset(arr_dest, 0, full_count);                     // destination array initializing
                dsps_memset(arr_dest, set_val, count);

                TEST_ASSERT_EACH_EQUAL_UINT8(set_val, arr_dest, count);
                if (count < arr_len) {                               // the rest of the destination must stay untouched
                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[count], (arr_len - count));
                }
                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
            }
        }
    }

    free(arr_dest_align);
    free(arr_src_align);
    free(arr_dest_unalign);
    free(arr_src_unalign);
}
/*
Test micro-benchmark of the memcpy and memset functions optimized for esp32s3 and esp32
Requires: esp32s3
Purpose:
- Test how fast the esp32s3 optimized memcpy and memset are compared to the esp32 optimized memcpy and memset
Procedure:
- Create 2 unaligned arrays, source and destination array
- Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
- Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
- Set the destination array using esp32s3 memset N times, while counting CPU cycles
- Set the destination array using esp32 memset N times, while counting CPU cycles
- Calculate benchmarks
- Free both arrays
*/
// Micro-benchmark comparing dsps_memcpy()/dsps_memset() with plain memcpy()/memset().
// Every routine is called CALL_REPEAT_COUNT times on AREA_LENGTH-byte heap buffers and the
// average number of CPU cycles per call is logged.
TEST_CASE("dsps_memcpy_memset_aes3_benchmark", "[dsps]")
{
    const size_t buf_len = AREA_LENGTH;                    // buffer length in bytes
    const size_t byte_total = sizeof(uint8_t) * buf_len;   // bytes handled per call
    const uint8_t fill_byte = 0xee;                        // value written by the memset runs
    uint8_t *src_buf = (uint8_t *)malloc(buf_len * sizeof(uint8_t));
    uint8_t *dst_buf = (uint8_t *)malloc(buf_len * sizeof(uint8_t));
    unsigned int tick_begin, tick_end;
    int rep;

    // ---- memcpy: routine under test first, reference second ----
    tick_begin = dsp_get_cpu_cycle_count();
    for (rep = 0; rep < CALL_REPEAT_COUNT; rep++) {
        dsps_memcpy((void *)dst_buf, (void *)src_buf, byte_total);
    }
    tick_end = dsp_get_cpu_cycle_count();
    const float s3_copy_cycles = ((float)(tick_end - tick_begin)) / CALL_REPEAT_COUNT;

    tick_begin = dsp_get_cpu_cycle_count();
    for (rep = 0; rep < CALL_REPEAT_COUNT; rep++) {
        memcpy((void *)dst_buf, (void *)src_buf, byte_total);
    }
    tick_end = dsp_get_cpu_cycle_count();
    const float ref_copy_cycles = ((float)(tick_end - tick_begin)) / CALL_REPEAT_COUNT;

    ESP_LOGI(TAG, "Micro benchmark of memcpy for unaligned array of %"PRIu32" bytes", (uint32_t)byte_total);
    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ref_copy_cycles);
    ESP_LOGI(TAG, "S3 optimized cycles = %.2f", s3_copy_cycles);

    // ---- memset: routine under test first, reference second ----
    tick_begin = dsp_get_cpu_cycle_count();
    for (rep = 0; rep < CALL_REPEAT_COUNT; rep++) {
        dsps_memset((void *)dst_buf, fill_byte, byte_total);
    }
    tick_end = dsp_get_cpu_cycle_count();
    const float s3_set_cycles = ((float)(tick_end - tick_begin)) / CALL_REPEAT_COUNT;

    tick_begin = dsp_get_cpu_cycle_count();
    for (rep = 0; rep < CALL_REPEAT_COUNT; rep++) {
        memset((void *)dst_buf, fill_byte, byte_total);
    }
    tick_end = dsp_get_cpu_cycle_count();
    const float ref_set_cycles = ((float)(tick_end - tick_begin)) / CALL_REPEAT_COUNT;

    ESP_LOGI(TAG, "Micro benchmark of memset for unaligned array of %"PRIu32" bytes", (uint32_t)byte_total);
    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ref_set_cycles);
    ESP_LOGI(TAG, "S3 optimized cycles = %.2f", s3_set_cycles);

    free(src_buf);
    free(dst_buf);
}
/*
Test micro-benchmark of the memcpy optimized for esp32s3 and esp32 and print a comparison report for copy lengths from
1 to 200 bytes, where the difference between the two memcpys is not clear-cut
Requires: esp32s3
Purpose:
- Test how fast the esp32s3 optimized memcpy is compared to the esp32 optimized memcpy
Procedure:
- Create 2 aligned arrays, source and destination array
- Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
- Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
- Calculate benchmarks and save the result
- Repeat the 3 above steps for different copy lengths (from 1 to 200 bytes)
and following arrays alignments
- destination array 16-byte aligned, source array 16-byte aligned
- destination array unaligned, source array 16-byte aligned
- destination array 16-byte aligned, source array unaligned
- destination array unaligned, source array unaligned
- Print table of results
- Free dynamic arrays
*/
// Benchmark report for dsps_memcpy() versus plain memcpy().
// For each of the four source/destination alignment combinations and every copy length
// 1..MEMCPY_REPORT_LEN, the average cycle count per call (over CALL_REPEAT_COUNT calls) is
// stored, printed as a table, and finally checked so that the dsps_memcpy() result is not
// below a quarter of the memcpy() result.
TEST_CASE("dsps_memcpy_benchmark_report", "[dsps]")
{
    unsigned int start_count, end_count;
    const unsigned int align_combinations = 4; // source and destination arrays aligned or unaligned combinations
    const int32_t arr_len = 256;
    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    uint8_t *arr_src = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    uint8_t *arr_dest_align = NULL, *arr_src_align = NULL;

    // 2D result tables: result_xxx[align_combinations][MEMCPY_REPORT_LEN]
    uint16_t **result_aes3 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));
    uint16_t **result_ae32 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));
    for (int i = 0; i < align_combinations; i++) {
        result_aes3[i] = (uint16_t *)malloc(MEMCPY_REPORT_LEN * sizeof(uint16_t));
        result_ae32[i] = (uint16_t *)malloc(MEMCPY_REPORT_LEN * sizeof(uint16_t));
    }

    for (int iter = 0; iter < align_combinations; iter++) {
        // An offset of one byte from the 16-byte aligned base gives the unaligned variant
        switch (iter) {
        case 0: // both 16-byte aligned
            arr_dest_align = arr_dest;
            arr_src_align = arr_src;
            break;
        case 1: // destination unaligned, source aligned
            arr_dest_align = arr_dest + 1;
            arr_src_align = arr_src;
            break;
        case 2: // source unaligned, destination aligned
            arr_dest_align = arr_dest;
            arr_src_align = arr_src + 1;
            break;
        case 3: // both unaligned
            arr_dest_align = arr_dest + 1;
            arr_src_align = arr_src + 1;
            break;
        default: // default - both aligned
            arr_dest_align = arr_dest;
            arr_src_align = arr_src;
            break;
        }

        for (int cpy_amount = 1; cpy_amount <= MEMCPY_REPORT_LEN; cpy_amount++) {
            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                dsps_memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_aes3[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));

            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_ae32[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
        }
    }

    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned source/destination array using default xtensa memcpy and s3 optimized memcpy");
    printf("\n\tdest aligned \tdest unaligned\tdest aligned\tdest unaligned\n");
    printf( "\tsrc aligned \tsrc aligned\tsrc unaligned\tsrc unaligned\n\n");
    printf( "byte \taes3 ae32\taes3 ae32\taes3 ae32\taes3 ae32\n");
    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
        printf("%d\t", i + 1);
        for (int j = 0; j < align_combinations; j++) {
            printf(" %d\t", result_aes3[j][i]);
            printf(" %d\t", result_ae32[j][i]);
        }
        putchar('\n');
    }

    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
        for (int j = 0; j < align_combinations; j++) {
            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 4, result_aes3[j][i]);
        }
    }

    free(arr_dest);
    free(arr_src);
    // Release every row before the row-pointer tables, otherwise the rows leak
    for (int i = 0; i < align_combinations; i++) {
        free(result_ae32[i]);
        free(result_aes3[i]);
    }
    free(result_ae32);
    free(result_aes3);
}
/*
Test micro-benchmark of the memset optimized for esp32s3 and esp32 and print a comparison report for set lengths from
1 to 200 bytes, where the difference between the two memsets is not clear-cut
Requires: esp32s3
Purpose:
- Test how fast the esp32s3 optimized memset is compared to the esp32 optimized memset
Procedure:
- Create 1 aligned array - destination array
- Set the destination array using esp32s3 memset N times, while counting CPU cycles
- Set the destination array using esp32 memset N times, while counting CPU cycles
- Calculate benchmarks and save the result
- Repeat the 3 above steps for different copy lengths (from 1 to 200 bytes)
and both destination arrays alignments (16-byte aligned and unaligned)
- Print table of results
- Free dynamic arrays
*/
// Benchmark report for dsps_memset() versus plain memset().
// For an aligned and a one-byte-offset destination and every set length
// 1..MEMSET_REPORT_LEN, the average cycle count per call (over CALL_REPEAT_COUNT calls) is
// stored, printed as a table, and finally checked so that the dsps_memset() result is not
// below an eighth of the memset() result.
TEST_CASE("dsps_memset_benchmark_report", "[dsps]")
{
    unsigned int start_count, end_count;
    const unsigned int align_combinations = 2; // destination arrays aligned or unaligned
    const int32_t arr_len = 256;
    const uint8_t set_val = 0xaa;
    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    uint8_t *arr_dest_align = NULL;

    // 2D result tables: result_xxx[align_combinations][MEMSET_REPORT_LEN]
    uint16_t **result_aes3 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));
    uint16_t **result_ae32 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));
    for (int i = 0; i < align_combinations; i++) {
        result_aes3[i] = (uint16_t *)malloc(MEMSET_REPORT_LEN * sizeof(uint16_t));
        result_ae32[i] = (uint16_t *)malloc(MEMSET_REPORT_LEN * sizeof(uint16_t));
    }

    for (int iter = 0; iter < align_combinations; iter++) {
        if (iter == 0) {
            arr_dest_align = arr_dest; // destination 16-byte aligned
        } else {
            arr_dest_align = arr_dest + 1; // destination unaligned
        }

        for (int set_amount = 1; set_amount <= MEMSET_REPORT_LEN; set_amount++) {
            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                dsps_memset((void *)arr_dest_align, set_val, set_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_aes3[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));

            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                memset((void *)arr_dest_align, set_val, set_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_ae32[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
        }
    }

    // This report is about memset, so the log line names memset (it previously said memcpy)
    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned destination array using default xtensa memset and s3 optimized memset");
    printf("\n\tdest aligned \tdest unaligned\n\n");
    printf( "byte \taes3 ae32\taes3 ae32\n");
    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
        printf("%d\t", i + 1);
        for (int j = 0; j < align_combinations; j++) {
            printf(" %d\t", result_aes3[j][i]);
            printf(" %d\t", result_ae32[j][i]);
        }
        putchar('\n');
    }

    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
        for (int j = 0; j < align_combinations; j++) {
            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 8, result_aes3[j][i]);
        }
    }

    free(arr_dest);
    // Release every row before the row-pointer tables, otherwise the rows leak
    for (int i = 0; i < align_combinations; i++) {
        free(result_ae32[i]);
        free(result_aes3[i]);
    }
    free(result_ae32);
    free(result_aes3);
}
/*
Test micro-benchmark of the memcpy and memset functions optimized for esp32s3, with task switching
Requires: esp32s3
Purpose:
- Test how fast the esp32s3 optimized memcpy and memset are while running memset and memcpy in multiple tasks
Procedure:
- Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same.
- Run the memcpy micro-benchmark routine (from the previous test case) in each of the tasks.
- Start all the tasks simultaneously
- Wait for the tasks to complete, then delete the tasks
- Get the benchmark result
- Repeat all the above steps with memset, instead of memcpy
- Free the created dynamic arrays
*/
// Context shared by the test case and all pinned benchmark tasks
typedef struct {
SemaphoreHandle_t semaphore; // given once by every task when it has finished
uint8_t *arr_src; // source buffer passed to dsps_memcpy
uint8_t *arr_dest; // destination buffer passed to dsps_memcpy / dsps_memset
uint8_t set_val; // value passed to dsps_memset
size_t area_len; // number of bytes passed to dsps_memcpy / dsps_memset
uint32_t mean_val_cpy; // each memcpy task adds its mean cycle count here
uint32_t mean_val_set; // each memset task adds its mean cycle count here
} test_context_benchmark_t;
// Benchmark task body: waits for the start notification, then measures CPY_ITERS rounds of
// CPY_REPEAT_COUNT dsps_memcpy() calls and adds the mean cycle count to the shared context.
// arg is a pointer to the shared test_context_benchmark_t.
// NOTE(review): several tasks add to the same mean_val_cpy field without locking - confirm
// that lost updates are acceptable for this benchmark.
static void pinned_task_benchmark_memcpy(void *arg)
{
    // Stay parked until the test case notifies this task
    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
    test_context_benchmark_t *ctx = (test_context_benchmark_t *)arg;
    long unsigned int total_cycles = 0;
    int round = 0;

    while (round < CPY_ITERS) {
        const unsigned int tick_begin = dsp_get_cpu_cycle_count();
        for (int rep = 0; rep < CPY_REPEAT_COUNT; rep++) {
            dsps_memcpy((void *)ctx->arr_dest, (void *)ctx->arr_src, ctx->area_len);
        }
        const unsigned int tick_end = dsp_get_cpu_cycle_count();
        total_cycles += (tick_end - tick_begin);
        vTaskDelay(1); // Block to cause a context switch, forcing the TIE context to be saved
        round++;
    }

    const uint32_t mean_cycles = (uint32_t)((total_cycles / CPY_REPEAT_COUNT) / CPY_ITERS);
    ctx->mean_val_cpy += mean_cycles;

    // Report completion, then wait to be deleted by the test case
    xSemaphoreGive(ctx->semaphore);
    vTaskSuspend(NULL);
}
// Benchmark task body: waits for the start notification, then measures CPY_ITERS rounds of
// CPY_REPEAT_COUNT dsps_memset() calls and adds the mean cycle count to the shared context.
// arg is a pointer to the shared test_context_benchmark_t.
// NOTE(review): several tasks add to the same mean_val_set field without locking - confirm
// that lost updates are acceptable for this benchmark.
static void pinned_task_benchmark_memset(void *arg)
{
    // Stay parked until the test case notifies this task
    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
    test_context_benchmark_t *ctx = (test_context_benchmark_t *)arg;
    long unsigned int total_cycles = 0;
    int round = 0;

    while (round < CPY_ITERS) {
        const unsigned int tick_begin = dsp_get_cpu_cycle_count();
        for (int rep = 0; rep < CPY_REPEAT_COUNT; rep++) {
            dsps_memset((void *)ctx->arr_dest, ctx->set_val, ctx->area_len);
        }
        const unsigned int tick_end = dsp_get_cpu_cycle_count();
        total_cycles += (tick_end - tick_begin);
        vTaskDelay(1); // Block to cause a context switch, forcing the TIE context to be saved
        round++;
    }

    const uint32_t mean_cycles = (uint32_t)((total_cycles / CPY_REPEAT_COUNT) / CPY_ITERS);
    ctx->mean_val_set += mean_cycles;

    // Report completion, then wait to be deleted by the test case
    xSemaphoreGive(ctx->semaphore);
    vTaskSuspend(NULL);
}
// Runs the memcpy benchmark task and then the memset benchmark task on every core
// (TEST_PINNED_NUM_TASKS tasks per core), waits for all tasks of an iteration to finish,
// and prints the cycle counts accumulated in the shared context.
TEST_CASE("dsps_memset_memcpy_context_switch_benchmark", "[dsps]")
{
test_context_benchmark_t test_context;
char task_name[10];
// Counting semaphore: every task gives it once when done
test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
test_context.area_len = (size_t)AREA_LENGTH;
// NOTE(review): the two buffers below are not checked for NULL before the tasks use them
test_context.arr_dest = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
test_context.arr_src = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
test_context.set_val = 0xab;
test_context.mean_val_cpy = 0;
test_context.mean_val_set = 0;
// Task body used in each iteration: index 0 - memcpy, index 1 - memset
// NOTE(review): indexed by iter below, so this presumably requires TEST_PINNED_NUM_ITERS <= 2 - confirm
static void (*pinned_functions[2])(void *);
pinned_functions[0] = pinned_task_benchmark_memcpy;
pinned_functions[1] = pinned_task_benchmark_memset;
TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);
for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];
// Create test tasks for each core
for (int i = 0; i < configNUM_CORES; i++) {
for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
sprintf(task_name, "task %d-%d", i, j);
TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
&test_context, 10, &task_handles[i][j], i));
}
}
// Start the created tasks simultaneously
for (int i = 0; i < configNUM_CORES; i++) {
for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
xTaskNotifyGive(task_handles[i][j]);
}
}
// Wait for the tasks to complete
for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
}
// Delete the tasks
for (int i = 0; i < configNUM_CORES; i++) {
for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
vTaskDelete(task_handles[i][j]);
}
}
vTaskDelay(10); // Short delay to allow the idle task to free task memory and TIE contexts
}
vSemaphoreDelete(test_context.semaphore);
free(test_context.arr_dest);
free(test_context.arr_src);
// Each task added its own mean to the context, so divide by the number of tasks
const uint32_t iterations = (uint32_t)(configNUM_CORES * TEST_PINNED_NUM_TASKS * CPY_REPEAT_COUNT * CPY_ITERS);
const uint32_t copy_mean_val = (uint32_t)(test_context.mean_val_cpy / (configNUM_CORES * TEST_PINNED_NUM_TASKS));
const uint32_t set_mean_val = (uint32_t)(test_context.mean_val_set / (configNUM_CORES * TEST_PINNED_NUM_TASKS));
printf("\nOut of %"PRIu32" iterations, array len of %"PRIu32" bytes\n", iterations, (uint32_t)AREA_LENGTH);
printf("Memcpy cycles = %"PRIu32"\n", copy_mean_val);
printf("Memset cycles = %"PRIu32"\n", set_mean_val);
}
/*
Test context switching for the TIE disabled and enabled
Requires: esp32s3
Purpose:
- Compare context switching between the tasks when TIE (esp32s3 instruction extension) is enabled and disabled to
see what is the switching time overhead for the TIE enabled
Procedure:
- Create a timer, 1000 ms is used for this test, but the exact time is not crucial
- Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same
- Start the created tasks simultaneously, start the timer
- A task executes a single assembler instruction from the TIE, to induce the context switch
- As soon as the instruction is executed, a context switch occurs
- A counter counts the number of context switches within the interval specified by the timer
- Wait for the timer to expire and terminate the tasks
- Get the number of task switches and delete all the tasks
- Repeat the 7 above steps with the created tasks executing a single generic Xtensa assembler instruction,
instead of the TIE instruction to get the switching overhead
*/
// Set to true by context_switch_timer_callback(); polled by the pinned tasks to end their loops.
// NOTE(review): the flag is shared between tasks without volatile/atomic qualification - confirm this is intended.
static bool timer_expired = false;
// Handle of the timer created by the dsps_TIE_context_switch_timing test case
static TimerHandle_t one_shot_timer = NULL;
// Context shared by the timing test case and all of its pinned tasks
typedef struct {
SemaphoreHandle_t semaphore; // given once by every task after the timer has expired
uint32_t switch_count_tie_on; // incremented by pinned_task_tie_on before every yield
uint32_t switch_count_tie_off; // incremented by pinned_task_tie_off before every yield
} test_context_timing_t;
// Task pinned to a core that executes a TIE instruction before every yield.
// arg is a pointer to the shared test_context_timing_t.
static void pinned_task_tie_on(void *arg)
{
    // Stay parked until the test case notifies this task
    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
    test_context_timing_t *ctx = (test_context_timing_t *)arg;
    vTaskDelay(1);

    for (;;) {
        if (timer_expired) {
            break;
        }
        asm volatile("ee.zero.q q0");
        ctx->switch_count_tie_on++;
        taskYIELD(); // Yield to cause a context switch, forcing the TIE context to be saved
    }

    // Report completion, then wait to be deleted by the test case
    xSemaphoreGive(ctx->semaphore);
    vTaskSuspend(NULL);
}
// Task pinned to a core that executes a generic Xtensa instruction before every yield.
// arg is a pointer to the shared test_context_timing_t.
static void pinned_task_tie_off(void *arg)
{
    // Stay parked until the test case notifies this task
    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
    test_context_timing_t *ctx = (test_context_timing_t *)arg;
    vTaskDelay(1);

    for (;;) {
        if (timer_expired) {
            break;
        }
        asm volatile("nop");
        ctx->switch_count_tie_off++;
        taskYIELD(); // Yield to cause a context switch, forcing the context to be saved
    }

    // Report completion, then wait to be deleted by the test case
    xSemaphoreGive(ctx->semaphore);
    vTaskSuspend(NULL);
}
// Timer callback: raises the flag that ends the loops of the pinned tasks.
static void context_switch_timer_callback(TimerHandle_t xTimer)
{
    (void)xTimer; // the timer handle is not needed here
    timer_expired = true;
}
// Counts how many yields the pinned tasks complete within one timer period, first with a
// TIE instruction in the loop and then with a generic instruction, and prints the relative
// difference between the two counts.
TEST_CASE("dsps_TIE_context_switch_timing", "[dsps]")
{
    test_context_timing_t test_context;
    const TickType_t timer_period_ms = 1000;
    char task_name[10];

    // Counting semaphore: every task gives it once when the timer has expired
    test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
    test_context.switch_count_tie_off = 0;
    test_context.switch_count_tie_on = 0;
    TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);

    // Task body used in each iteration: index 0 - TIE instruction, index 1 - generic instruction
    static void (*pinned_functions[2])(void *);
    pinned_functions[0] = pinned_task_tie_on;
    pinned_functions[1] = pinned_task_tie_off;

    one_shot_timer = xTimerCreate("timer", pdMS_TO_TICKS(timer_period_ms), pdFALSE, (void *)0, context_switch_timer_callback);
    // Without a timer the tasks would never leave their loops, so fail early
    TEST_ASSERT_NOT_EQUAL(NULL, one_shot_timer);

    for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
        timer_expired = false;
        TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];

        // Create test tasks for each core
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                sprintf(task_name, "task %d-%d", i, j);
                TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
                                  &test_context, 1, &task_handles[i][j], i));
            }
        }

        // Start the created tasks simultaneously
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                xTaskNotifyGive(task_handles[i][j]);
            }
        }
        xTimerStart(one_shot_timer, portMAX_DELAY);
        vTaskDelay(1);

        // Wait for the tasks to complete
        for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
            xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
        }

        // Delete the tasks
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                vTaskDelete(task_handles[i][j]);
            }
        }
        vTaskDelay(10); // Short delay to allow the idle task to free task memory and TIE contexts
    }

    vSemaphoreDelete(test_context.semaphore);
    // Release the software timer as well; it was previously left allocated
    xTimerDelete(one_shot_timer, portMAX_DELAY);
    one_shot_timer = NULL;

    printf("\nContext switching count within %"PRIu32" ms nterval\n", (uint32_t)timer_period_ms);
    printf("TIE enabled %"PRIu32"\n", test_context.switch_count_tie_on);
    printf("TIE disabled %"PRIu32"\n", test_context.switch_count_tie_off);
    float overhead = (((float)test_context.switch_count_tie_off / (float)test_context.switch_count_tie_on) * 100) - 100;
    printf("Switch overhead %.2f %%\n", overhead);
}

View File

@@ -0,0 +1,30 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_d_gen.h"
// Delta function generator: fills output[0..len-1] with zeros and puts a single 1 at
// index pos.
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when pos is outside [0, len), ESP_OK otherwise.
esp_err_t dsps_d_gen_f32(float *output, int len, int pos)
{
    if (pos < 0 || pos >= len) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    // Clear the whole buffer, then set the single non-zero sample
    float *cursor = output;
    const float *const stop = output + len;
    while (cursor < stop) {
        *cursor++ = 0;
    }
    output[pos] = 1;

    return ESP_OK;
}

View File

@@ -0,0 +1,32 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_h_gen.h"
// Step function generator: output[i] is 0 for i < pos and 1 for pos <= i < len.
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when pos is outside [0, len), ESP_OK otherwise.
esp_err_t dsps_h_gen_f32(float *output, int len, int pos)
{
    if (pos < 0 || pos >= len) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    for (int idx = 0; idx < len; idx++) {
        output[idx] = (idx < pos) ? 0 : 1;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,39 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_tone_gen.h"
#include <math.h>
// Sine tone generator: writes len samples of Ampl * sin(phase) to output.
//   freq  - must lie strictly between -1 and 1; scaled by 2*pi to get the per-sample
//           phase increment in radians
//   phase - start phase in degrees
// Returns ESP_ERR_DSP_INVALID_PARAM when freq is out of range, ESP_OK otherwise.
esp_err_t dsps_tone_gen_f32(float *output, int len, float Ampl, float freq, float phase)
{
    if (freq >= 1 || freq <= -1) {
        return ESP_ERR_DSP_INVALID_PARAM;
    }

    const double full_turn = 2 * M_PI;
    float angle = phase / 180 * M_PI; // degrees -> radians
    const float step = 2 * M_PI * freq; // phase increment per sample

    int n = 0;
    while (n < len) {
        output[n] = Ampl * sin(angle);
        angle += step;
        // Keep the running phase within one turn of zero
        if (angle > full_turn) {
            angle -= full_turn;
        }
        if (angle < -full_turn) {
            angle += full_turn;
        }
        n++;
    }

    return ESP_OK;
}

View File

@@ -0,0 +1,74 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_sfdr.h"
#include "dsps_fft2r.h"
#include "dsp_common.h"
#include <math.h>
#include <limits>
#include "esp_log.h"
// Tag passed to the ESP_LOGD calls in this file
static const char *TAG = "sfdr";
// Estimates the spurious-free dynamic range of a real signal.
//   input  - len real samples
//   len    - number of samples, must be a power of two
//   use_dc - when 0, the first wind_width bins are excluded from the spur search
// The signal is windowed with 0.5 * (1 - cos(2*pi*i/len)), transformed with the radix-2
// FFT, and converted to dB per bin over the first len/2 bins. The result is the smallest
// distance in dB between the strongest bin and any bin farther than wind_width bins away
// from it.
// Returns 0 when len is not a power of two.
float dsps_sfdr_f32(const float *input, int32_t len, int8_t use_dc)
{
    if (!dsp_is_power_of_two(len)) {
        return 0;
    }

    // Interleaved complex buffer: [re0, im0, re1, im1, ...]
    float *temp_array = new float[len * 2];
    for (int i = 0 ; i < len ; i++) {
        float wind = 0.5 * (1 - cosf(i * 2 * M_PI / (float)len));
        temp_array[i * 2 + 0] = input[i] * wind;
        temp_array[i * 2 + 1] = 0;
    }
    dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);

    dsps_fft2r_fc32_ansi(temp_array, len);
    dsps_bit_rev_fc32_ansi(temp_array, len);

    // The bin values are in dB and may all be negative, so the running maximum has to start
    // at the most negative finite float. numeric_limits<float>::min() is the smallest
    // POSITIVE value and would hide every peak below 0 dB.
    float max = -std::numeric_limits<float>::max();
    int max_pos = 0;
    for (int i = 0 ; i < len / 2 ; i++) {
        // In-place: bin i is written only after its source elements 2*i and 2*i+1 are read
        temp_array[i] = 10 * log10f(temp_array[i * 2 + 0] * temp_array[i * 2 + 0] + temp_array[i * 2 + 1] * temp_array[i * 2 + 1]);
        if (temp_array[i] > max) {
            max = temp_array[i];
            max_pos = i;
        }
        ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB", i, temp_array[i]);
    }

    int start_pos = 0;
    int wind_width = 5;
    float min_diff = std::numeric_limits<float>::max();
    if (use_dc == 0) {
        start_pos = wind_width;
    }

    // Search the strongest spur outside the window around the peak
    for (int i = start_pos ; i < len / 2 ; i++) {
        if ((i < (max_pos - wind_width)) || (i > (max_pos + wind_width))) {
            float diff = max - temp_array[i];
            if (diff < min_diff) {
                ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB, maX=%f, max_pos=%i", i, temp_array[i], max, max_pos);
                min_diff = diff;
            }
        }
    }
    delete[] temp_array;
    return min_diff;
}

View File

@@ -0,0 +1,43 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dsps_view.h"
#include "dsps_sfdr.h"
#include "dsps_fft2r.h"
// Tag passed to the ESP_LOGI call in this test file
static const char *TAG = "dsps_sfdr_f32";
// Builds a signal from two sine components with an amplitude ratio of 4:1 and checks that
// dsps_sfdr_f32() reports 20 * log10(4) dB, compared after truncation to an integer.
TEST_CASE("dsps_sfdr_f32 functionality", "[dsps]")
{
    int N = 512;
    float *data = (float *)malloc(N * 2 * sizeof(float));
    int check_bin = 32;
    float sfdr_exp = 4; // expected amplitude ratio between the two components

    for (int i = 0 ; i < N ; i++) {
        data[i] = 4 * sinf(M_PI / N * check_bin * i) / (N / 2);
        data[i] += sinf(M_PI / N * check_bin * i * 2) / (N / 2);
    }

    float sfdr = dsps_sfdr_f32(data, N, 1);

    // Cast the whole expected value: "(int)20 * log10(x)" only casts the literal 20
    TEST_ASSERT_EQUAL((int)(20 * log10(sfdr_exp)), (int)sfdr);
    ESP_LOGI(TAG, "dsps_sfdr_f32 = %f dB", sfdr);
    dsps_fft2r_deinit_fc32();
    free(data);
}

View File

@@ -0,0 +1,78 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_snr.h"
#include "dsps_fft2r.h"
#include "dsp_common.h"
#include <math.h>
#include <limits>
#include "esp_log.h"
// Tag passed to the ESP_LOGD / ESP_LOGI calls in this file
static const char *TAG = "snr";
// Estimates the signal-to-noise ratio of a real signal in dB.
//   input  - len real samples
//   len    - number of samples, must be a power of two
//   use_dc - when 0, the first 7 bins are excluded from the noise sum
// The signal is windowed with 0.5 * (1 - cos(2*pi*i/len)) and transformed with the radix-2
// FFT. The power of the strongest bin in the first len/2 bins is compared with the summed
// power of all bins farther than 7 bins away from it.
// Returns 0 when len is not a power of two and 192 when the noise power is negligible.
float dsps_snr_f32(const float *input, int32_t len, uint8_t use_dc)
{
    if (!dsp_is_power_of_two(len)) {
        return 0;
    }

    // Interleaved complex buffer: [re0, im0, re1, im1, ...]
    float *spectrum = new float[len * 2];
    for (int n = 0 ; n < len ; n++) {
        float wind = 0.5 * (1 - cosf(n * 2 * M_PI / (float)len));
        spectrum[n * 2 + 0] = input[n] * wind;
        spectrum[n * 2 + 1] = 0;
    }
    dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);
    dsps_fft2r_fc32_ansi(spectrum, len);
    dsps_bit_rev_fc32_ansi(spectrum, len);

    const int half_len = len / 2;

    // Power per bin, stored in place, while tracking the strongest bin
    float peak_power = std::numeric_limits<float>::min();
    int peak_bin = 0;
    for (int k = 0 ; k < half_len ; k++) {
        spectrum[k] = spectrum[k * 2 + 0] * spectrum[k * 2 + 0] + spectrum[k * 2 + 1] * spectrum[k * 2 + 1];
        if (spectrum[k] > peak_power) {
            peak_power = spectrum[k];
            peak_bin = k;
        }
        ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB", k, spectrum[k]);
    }

    const int guard = 7;
    const int first_bin = (use_dc == 0) ? guard : 0;

    // Sum the power of every bin outside the guard window around the peak
    float noise_power = 0;
    for (int k = first_bin ; k < half_len ; k++) {
        const bool near_peak = (k >= (peak_bin - guard)) && (k <= (peak_bin + guard));
        if (near_peak) {
            continue;
        }
        noise_power += spectrum[k];
        ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB, maX=%f, max_pos=%i, noise_power=%f", k, spectrum[k], peak_power, peak_bin, noise_power);
    }
    delete[] spectrum;

    // Keeps the division below away from zero
    noise_power += std::numeric_limits<float>::min();
    if (noise_power < peak_power * 0.00000000001) {
        return 192;
    }

    float snr = peak_power / noise_power;
    float result = 10 * log10(peak_power / noise_power) - 2; // 2 - window correction
    ESP_LOGI(TAG, "SNR = %f, result=%f dB", snr, result);
    return result;
}

View File

@@ -0,0 +1,44 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dsps_view.h"
#include "dsps_snr.h"
#include "dsps_fft2r.h"
// Tag passed to the ESP_LOGI call in this test file
static const char *TAG = "dsps_snr_f32";
// Builds a sine with a small constant offset added to every sample and checks the value
// reported by dsps_snr_f32() against -(20 * log10(snr_exp) + 3), both rounded.
TEST_CASE("dsps_snr_f32 functionality", "[dsps]")
{
    int N = 512;
    int check_bin = 32;
    float snr_exp = 0.001;
    float *signal = (float *)malloc(N * 2 * sizeof(float));

    int idx = 0;
    while (idx < N) {
        float sample = 1 * sinf(M_PI / N * check_bin * idx) / (N / 2);
        sample += 0.001 / N; //0.1*sinf(M_PI/N*check_bin*i*2)/(N/2);
        signal[idx] = sample;
        idx++;
    }

    float snr = dsps_snr_f32(signal, N, 1);

    const double expected_db = -round(20 * log10(snr_exp) + 3);
    TEST_ASSERT_EQUAL(expected_db, (int)round(snr));
    ESP_LOGI(TAG, "dsps_snr_f32 = %f dB", snr);
    dsps_fft2r_deinit_fc32();
    free(signal);
}

View File

@@ -0,0 +1,120 @@
#include "dsps_view.h"
#include <math.h>
#include "esp_log.h"
#include <limits>
#include <inttypes.h>
/**
 * @brief Print an ASCII plot of a float array to stdout.
 *
 * The len samples are spread over width columns. For each column the lowest
 * and highest sample, clamped to [min, max], are tracked and every row between
 * them is filled with view_char. Row 0 (printed first) corresponds to max.
 * The index and value of the overall smallest and largest sample are logged.
 *
 * Nothing is drawn when width or height is not positive, or when max is not
 * greater than min.
 *
 * @param data      samples to plot; indices 0 .. len-1 are read
 * @param len       number of samples
 * @param width     plot width in characters
 * @param height    plot height in characters
 * @param min       lower bound of the plotted value range
 * @param max       upper bound of the plotted value range
 * @param view_char character used to draw the data
 */
void dsps_view(const float *data, int32_t len, int width, int height, float min, float max, char view_char)
{
    // width and height size the buffers below and (max - min) is a divisor,
    // so reject inputs that would give a bad allocation size or an inf/NaN scale.
    if (width <= 0 || height <= 0 || !(max > min)) {
        ESP_LOGI("view", "Nothing to plot: width=%i, height=%i, min=%f, max=%f", width, height, min, max);
        return;
    }

    const size_t cell_count = (size_t)width * (size_t)height;
    uint8_t *view_data = new uint8_t[cell_count];
    float *view_data_min = new float[width];
    float *view_data_max = new float[width];

    // Blank canvas.
    for (size_t cell = 0; cell < cell_count; cell++) {
        view_data[cell] = ' ';
    }
    // Every column starts with an inverted range (lowest = max, highest = min);
    // a sample strictly between min and max then replaces both bounds.
    for (int x = 0 ; x < width ; x++) {
        view_data_min[x] = max;
        view_data_max[x] = min;
    }

    const float x_step = (len > 0) ? (float)(width) / (float)len : 0.0f;
    const float y_step = (float)(height - 1) / (max - min);

    float data_min = std::numeric_limits<float>::max();
    // numeric_limits<float>::min() is the smallest POSITIVE float; seeding the
    // maximum with it hides any maximum that is zero or negative.
    float data_max = -std::numeric_limits<float>::max();
    int min_pos = 0;
    int max_pos = 0;

    for (int i = 0 ; i < len ; i++) {
        int x_pos = i * x_step;
        // Float rounding must never push the last samples past the final column.
        if (x_pos >= width) {
            x_pos = width - 1;
        }

        if (data[i] < view_data_min[x_pos]) {
            view_data_min[x_pos] = data[i];
        }
        if (data[i] > view_data_max[x_pos]) {
            view_data_max[x_pos] = data[i];
        }
        // Keep the column range inside the plotted value range.
        if (view_data_min[x_pos] < min) {
            view_data_min[x_pos] = min;
        }
        if (view_data_max[x_pos] > max) {
            view_data_max[x_pos] = max;
        }
        // Arguments follow the order of the labels in the format string (max first, then min).
        ESP_LOGD("view", "for i=%i, x_pos=%i, max=%f, min=%f, data=%f", i, x_pos, view_data_max[x_pos], view_data_min[x_pos], data[i]);

        if (data[i] > data_max) {
            data_max = data[i];
            max_pos = i;
        }
        if (data[i] < data_min) {
            data_min = data[i];
            min_pos = i;
        }
    }
    ESP_LOGI("view", "Data min[%i] = %f, Data max[%i] = %f", min_pos, data_min, max_pos, data_max);
    ESP_LOGD("view", "y_step = %f", y_step);

    // Fill each column from its highest value downwards to its lowest value.
    for (int x = 0 ; x < width ; x++) {
        int y_count = (view_data_max[x] - view_data_min[x]) * y_step + 1;
        ESP_LOGD("view", "For x= %i y_count=%i ,min =%f, max=%f, ... ", x, y_count, view_data_min[x], view_data_max[x]);
        for (int y = 0 ; y < y_count ; y++) {
            int y_pos = (max - view_data_max[x]) * y_step + y;
            ESP_LOGD("view", " %i, ", y_pos);
            // The row comes from float arithmetic; never write outside the canvas.
            if (y_pos >= 0 && y_pos < height) {
                view_data[y_pos * width + x] = view_char;
            }
        }
        ESP_LOGD("view", " ");
    }

    // Top border.
    printf(" ");
    for (int x = 0 ; x < width ; x++) {
        printf("_");
    }
    printf("\n");
    // Canvas rows, each prefixed with the last digit of the row index.
    for (int y = 0; y < height ; y++) {
        printf("%i", y % 10);
        for (int x = 0 ; x < width ; x++) {
            printf("%c", view_data[y * width + x]);
        }
        printf("|\n");
    }
    // Column ruler: last digit of every column index.
    printf(" ");
    for (int x = 0 ; x < width ; x++) {
        printf("%i", x % 10);
    }
    printf("\n");
    ESP_LOGI("view", "Plot: Length=%i, min=%f, max=%f", (int)len, min, max);

    delete[] view_data;
    delete[] view_data_min;
    delete[] view_data_max;
}
/**
 * @brief Plot an int16_t array with dsps_view().
 *
 * Each sample is converted to float and divided by 32768 into a temporary
 * buffer, which is then handed to dsps_view() with the remaining arguments
 * unchanged.
 *
 * @param data      samples to plot; indices 0 .. len-1 are read
 * @param len       number of samples; a negative value is treated as 0
 * @param width     plot width in characters
 * @param height    plot height in characters
 * @param min       lower bound of the plotted value range (after scaling)
 * @param max       upper bound of the plotted value range (after scaling)
 * @param view_char character used to draw the data
 */
void dsps_view_s16(const int16_t *data, int32_t len, int width, int height, float min, float max, char view_char)
{
    // A negative length must not reach new[], and must not be compared against
    // an unsigned loop index (it would convert to a huge value).
    const int32_t count = (len > 0) ? len : 0;

    float *view_data = new float[count];
    for (int32_t i = 0; i < count; i++) {
        view_data[i] = (float)data[i] / 32768;
    }
    dsps_view(view_data, count, width, height, min, max, view_char);
    delete[] view_data;
}
/**
 * @brief Plot a float array with dsps_view() on a fixed 64 x 10 character
 *        canvas, drawn with '|'.
 *
 * @param data samples to plot
 * @param len  number of samples
 * @param min  lower bound of the plotted value range
 * @param max  upper bound of the plotted value range
 */
void dsps_view_spectrum(const float *data, int32_t len, float min, float max)
{
    const int plot_width = 64;
    const int plot_height = 10;
    const char plot_char = '|';

    dsps_view(data, len, plot_width, plot_height, min, max, plot_char);
}

View File

@@ -0,0 +1,36 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dsps_view.h"
// Log tag used by the test in this file.
static const char *TAG = "dsps_view";

// Smoke test for dsps_view_spectrum(): plots a flat line at -100 with a single
// sample raised to 0. Nothing is asserted about the plot itself; the test only
// checks that the call runs through.
TEST_CASE("dsps_view functionality", "[dsps]")
{
    const int len = 1024;
    float *data = (float *)malloc(len * sizeof(float));
    // Abort the test on allocation failure instead of writing through NULL.
    TEST_ASSERT_NOT_NULL(data);

    for (int i = 0 ; i < len ; i++) {
        data[i] = -100;
    }
    // One sample at the top of the plotted range.
    data[256] = 0;

    dsps_view_spectrum(data, len, -100, 0);
    ESP_LOGI(TAG, "Just a check\n");
    free(data);
}