add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen.S
+++ b/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen.S
@@ -0,0 +1,120 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_cplx_gen_platform.h"
+#if (dsps_cplx_gen_aes3_enbled || dsps_cplx_gen_ae32_enbled)
+
+// Complex signal generator (LUT based) for Xtensa cores with FPU and zero-overhead loops.
+    .text
+    .align  4
+    .global dsps_cplx_gen_ae32
+    .type   dsps_cplx_gen_ae32,@function
+// The function implements the following C code:
+// esp_err_t dsps_cplx_gen_ae32(cplx_sig_t *cplx_gen, void *output, int32_t len);
+//
+// For every output sample the phase is wrapped, converted to a LUT index and
+// the pair (cos, sin) is copied from the LUT to the output, cos first.
+// The running phase is kept in f3 only; it is never stored back to cplx_gen.
+
+dsps_cplx_gen_ae32:
+
+// Input params                 Variables float             Variables fixed
+//
+// cplx_gen - a2                fr              - f0        lut             - a5
+// output   - a3                one_const       - f1        lut_len         - a6
+// len      - a4                lut_len_f       - f2        sin_pos         - a7
+//                              ph_f            - f3        cos_pos         - a8
+//                              sin_pos_f       - f4        sin_to_cos      - a9
+//                                                          ph_floor        - a10
+//                                                          modulo          - a11
+//
+// a12 - a15 are used as scratch registers (d_type, LUT addresses and loaded Q15 values)
+
+    entry    a1,    32
+    l32i     a5,    a2,    0                    // a5 - lut      (cplx_gen offset 0)
+    l32i     a6,    a2,    4                    // a6 - lut_len  (cplx_gen offset 4)
+    lsi      f0,    a2,    8                    // f0 - fr, phase increment per sample (cplx_gen offset 8)
+    lsi      f3,    a2,    12                   // f3 - ph_f, current phase (cplx_gen offset 12)
+    const.s  f1,     1                          // f1 - constant 1
+    float.s  f2,    a6,    0                    // f2 - lut_len_f
+    srli     a9,    a6,    2                    // a9 - sin_to_cos = lut_len / 4
+    addi     a11,   a6,   -1                    // a11 - modulo = lut_len - 1 (used as a mask, assumes lut_len is a power of 2)
+
+    l32i     a15,   a2,    16                   // a15 - d_type (cplx_gen offset 16)
+    beqz     a15, _s16_fixed                    // d_type == 0 selects the Q15 fixed point path
+
+    // F32 floating point
+    loopnez a4, ._main_loop_float               // repeat the loop body len times, skip it when len == 0
+
+        floor.s     a10,   f3,   0              // ph_floor = ph_f truncated with rounding towards -infinity
+
+        // branch if ph_floor is greater than or equal to 0 (phase is not negative)
+        bgez    a10, _ph_check_low_float
+            add.s       f3,    f3,   f1         // f3 = f3 + f1 (ph_f + 1)
+            floor.s     a10,   f3,    0         // recompute ph_floor for the corrected phase
+        _ph_check_low_float:
+
+        // branch if ph_floor is lower than 1 (phase is below 1)
+        blti    a10, 1, _ph_check_great_float
+            sub.s   f3,    f3,   f1             // f3 = f3 - f1 (ph_f - 1)
+        _ph_check_great_float:
+
+        mul.s   f4,   f3,  f2                   // sin_pos_f = ph_f * lut_len
+        trunc.s a7,   f4,  0                    // truncate sin_pos_f to sin_pos
+
+        add     a8,   a7,  a9                   // cos_pos (a8) = sin_pos(a7) + sin_to_cos(a9)
+        and     a8,   a8,  a11                  // cos_pos = cos_pos & modulo (lut_len - 1)
+
+        slli    a8,   a8,  2                    // byte offset into the LUT (4 x cos_pos)
+        slli    a7,   a7,  2                    // byte offset into the LUT (4 x sin_pos)
+
+        lsx     f14,  a5,  a7                   // load sin LUT value from *lut
+        lsx     f15,  a5,  a8                   // load cos LUT value from *lut
+
+        ssi     f15,  a3,  0                    // save cos LUT value to the output, offset 0
+        ssi     f14,  a3,  4                    // save sin LUT value to the output, offset 4
+        add.s   f3,   f3,  f0                   // ph_f += fr
+
+        addi.n  a3,   a3,  8                    // increase the output pointer (2 x f32)
+    ._main_loop_float:
+
+    movi.n a2, 0                                // return value 0
+    retw.n
+
+    // Q15 fixed point
+    _s16_fixed:
+    loopnez a4, ._main_loop_fixed               // repeat the loop body len times, skip it when len == 0
+
+        floor.s     a10,   f3,   0              // ph_floor = ph_f truncated with rounding towards -infinity
+
+        // branch if ph_floor is greater than or equal to 0 (phase is not negative)
+        bgez    a10, _ph_check_low_fixed
+            add.s       f3,    f3,   f1         // f3 = f3 + f1 (ph_f + 1)
+            floor.s     a10,   f3,    0         // recompute ph_floor for the corrected phase
+        _ph_check_low_fixed:
+
+        // branch if ph_floor is lower than 1 (phase is below 1)
+        blti    a10, 1, _ph_check_great_fixed
+            sub.s   f3,    f3,   f1             // f3 = f3 - f1 (ph_f - 1)
+        _ph_check_great_fixed:
+
+        mul.s   f4,   f3,  f2                   // sin_pos_f = ph_f * lut_len
+        trunc.s a7,   f4,  0                    // truncate sin_pos_f to sin_pos
+
+        add     a8,   a7,  a9                   // cos_pos (a8) = sin_pos(a7) + sin_to_cos(a9)
+        and     a8,   a8,  a11                  // cos_pos = cos_pos & modulo (lut_len - 1)
+
+        addx2   a15,  a8,  a5                   // address of the cos LUT entry (*lut + 2 x cos_pos)
+        addx2   a13,  a7,  a5                   // address of the sin LUT entry (*lut + 2 x sin_pos)
+
+        l16si   a14,  a15, 0                    // load cos LUT value from *lut
+        l16si   a12,  a13, 0                    // load sin LUT value from *lut
+
+        s16i    a14,  a3,  0                    // save cos LUT value to the output (a3), offset 0
+        s16i    a12,  a3,  2                    // save sin LUT value to the output (a3), offset 2
+        add.s   f3,   f3,  f0                   // ph_f += fr
+
+        addi.n  a3,   a3,  4                    // increase the output pointer (2 x s16)
+    ._main_loop_fixed:
+
+    movi.n a2, 0                                // return value 0
+    retw.n
+
+#endif // (dsps_cplx_gen_aes3_enbled || dsps_cplx_gen_ae32_enbled)
--- a/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen.c
+++ b/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen.c
@@ -0,0 +1,40 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_cplx_gen.h"
+
+esp_err_t dsps_cplx_gen_ansi(cplx_sig_t *cplx_gen, void *output, int32_t len)
+{
+    // cplx_gen->freq is used directly as the phase increment per output sample.
+    // The running phase lives in a local variable only; cplx_gen->phase is not written back.
+    const int quarter_period = cplx_gen->lut_len / 4;   // LUT distance between the sin and the cos entry
+    const float step = cplx_gen->freq;
+    float acc = cplx_gen->phase;
+
+    int n = 0;
+    while (n < len) {
+        // Wrap the accumulator: at most one correction upwards and one downwards per sample
+        if (acc < 0) {
+            acc += 1.0;
+        }
+        if (acc >= 1.0) {
+            acc -= 1.0;
+        }
+
+        const int sin_idx = (int)(acc * (cplx_gen->lut_len));
+        const int cos_idx = (sin_idx + quarter_period) & (cplx_gen->lut_len - 1);
+        const int re = n * 2;       // slot receiving the cos value of sample n
+        const int im = re + 1;      // slot receiving the sin value of sample n
+
+        if (cplx_gen->d_type != S16_FIXED) {
+            // Single precision floating point LUT and output
+            float *dst = (float *)output;
+            float *table = (float *)cplx_gen->lut;
+            dst[re] = table[cos_idx];
+            dst[im] = table[sin_idx];
+        } else {
+            // Q15 fixed point LUT and output
+            int16_t *dst = (int16_t *)output;
+            int16_t *table = (int16_t *)cplx_gen->lut;
+            dst[re] = table[cos_idx];
+            dst[im] = table[sin_idx];
+        }
+
+        acc += step;
+        n++;
+    }
+
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen_init.c
+++ b/managed_components/espressif__esp-dsp/modules/support/cplx_gen/dsps_cplx_gen_init.c
@@ -0,0 +1,148 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+
+#include "dsps_cplx_gen.h"
+#include "dsp_common.h"
+#include "esp_log.h"
+#include <math.h>
+#include <malloc.h>
+
+#define Q15_MAX INT16_MAX
+
+static const char *TAG = "dsps_cplx_gen";
+
+/*
+ * Initialize the complex generator structure and, when no LUT is supplied, build one.
+ *
+ * cplx_gen      - structure to fill in; all fields are written even when an error is returned
+ * d_type        - S16_FIXED or F32_FLOAT, selects the LUT / output sample format
+ * lut           - user LUT, or NULL to let the function allocate and fill one sine period
+ * lut_len       - number of LUT entries, must be a power of 2 (256..8192 when lut is NULL)
+ * freq          - phase increment per sample, must be inside (-1, 1)
+ * initial_phase - starting phase, must be inside (-1, 1)
+ *
+ * Returns ESP_OK on success, ESP_ERR_DSP_INVALID_LENGTH, ESP_ERR_DSP_PARAM_OUTOFRANGE or
+ * ESP_ERR_DSP_INVALID_PARAM for rejected arguments and ESP_ERR_NO_MEM when the LUT
+ * can not be allocated.
+ */
+esp_err_t dsps_cplx_gen_init(cplx_sig_t *cplx_gen, out_d_type d_type, void *lut, int32_t lut_len, float freq, float initial_phase)
+{
+    cplx_gen->lut_len = lut_len;
+    cplx_gen->freq = freq;
+    cplx_gen->lut = lut;
+    cplx_gen->free_status = 0;
+    cplx_gen->d_type = d_type;
+    cplx_gen->phase = initial_phase;
+
+    // length of the LUT must be power of 2
+    if (!dsp_is_power_of_two(lut_len)) {
+        ESP_LOGE(TAG, "The length of the LUT must be power of 2");
+        return ESP_ERR_DSP_INVALID_LENGTH;
+    }
+
+    // LUT length must be in a range from 256 to 8192 (only enforced for the internally allocated LUT)
+    if ((lut == NULL) && ((cplx_gen->lut_len > 8192) || (cplx_gen->lut_len < 256))) {
+        ESP_LOGE(TAG, "The length of the LUT table out of range. Valid range is 256 to 8192");
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    // frequency must be in a range from (-1 to 1); the negated form also rejects NaN,
+    // which would otherwise pass both ">= 1" and "<= -1" tests and lead to an invalid LUT index
+    if (!((cplx_gen->freq > -1) && (cplx_gen->freq < 1))) {
+        ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    // initial phase in a range from (-1 to 1), NaN is rejected as well
+    if (!((cplx_gen->phase > -1) && (cplx_gen->phase < 1))) {
+        ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    // LUT has been provided by the user, nothing to allocate
+    if (lut != NULL) {
+        return ESP_OK;
+    }
+
+    // LUT table coefficients generation: one full sine period over lut_len entries
+    if (cplx_gen->d_type == S16_FIXED) {                        // Q15 fixed point
+        int16_t *local_lut = (int16_t *)malloc(cplx_gen->lut_len * sizeof(int16_t));
+        if (local_lut == NULL) {
+            ESP_LOGE(TAG, "Not possible to allocate the LUT table");
+            return ESP_ERR_NO_MEM;
+        }
+
+        float term;
+        for (int i = 0 ; i < cplx_gen->lut_len; i++) {
+            term = (2.0 * M_PI) * ((float)(i) / (float)(cplx_gen->lut_len));
+            local_lut[i] = (int16_t)(sin(term) * Q15_MAX);      // conversion to Q15 fixed point
+        }
+        cplx_gen->lut = (void *)local_lut;
+    } else if (cplx_gen->d_type == F32_FLOAT) {                 // Single precision floating point
+        float *local_lut = (float *)malloc(cplx_gen->lut_len * sizeof(float));
+        if (local_lut == NULL) {
+            ESP_LOGE(TAG, "Not possible to allocate the LUT table");
+            return ESP_ERR_NO_MEM;
+        }
+
+        float term;
+        for (int i = 0 ; i < cplx_gen->lut_len; i++) {
+            term = (2.0 * M_PI) * ((float)(i) / (float)(cplx_gen->lut_len));
+            local_lut[i] = (float)sin(term);
+        }
+        cplx_gen->lut = (void *)local_lut;
+    } else {
+        cplx_gen->lut = NULL;
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    // The LUT is owned by the generator from now on: cplx_gen_free() has to release it
+    cplx_gen->free_status |= 0x0001;
+    return ESP_OK;
+}
+
+/*
+ * Set a new frequency (phase increment per sample) of the generator.
+ *
+ * Returns ESP_OK when the value was stored, ESP_ERR_DSP_INVALID_PARAM when freq is
+ * not inside the open interval (-1, 1); the structure is left untouched in that case.
+ */
+esp_err_t dsps_cplx_gen_freq_set(cplx_sig_t *cplx_gen, float freq)
+{
+    // The negated "inside the range" test also rejects NaN, which compares false
+    // against every bound and would slip through a ">= 1 || <= -1" test
+    if (!((freq > -1) && (freq < 1))) {
+        ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    cplx_gen->freq = freq;
+    return ESP_OK;
+}
+
+float dsps_cplx_gen_freq_get(cplx_sig_t *cplx_gen)
+{
+    // A power-of-two LUT length is taken as the sign that the structure went through the init
+    if (dsp_is_power_of_two(cplx_gen->lut_len)) {
+        return cplx_gen->freq;
+    }
+
+    // -2 lies outside of the accepted frequency range and marks the error
+    ESP_LOGE(TAG, "cplx_gen strucure was not initialized");
+    return -2;
+}
+
+/*
+ * Set a new phase of the generator.
+ *
+ * Returns ESP_OK when the value was stored, ESP_ERR_DSP_INVALID_PARAM when phase is
+ * not inside the open interval (-1, 1); the structure is left untouched in that case.
+ */
+esp_err_t dsps_cplx_gen_phase_set(cplx_sig_t *cplx_gen, float phase)
+{
+    // The negated "inside the range" test also rejects NaN, which compares false
+    // against every bound and would slip through a ">= 1 || <= -1" test
+    if (!((phase > -1) && (phase < 1))) {
+        ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    cplx_gen->phase = phase;
+    return ESP_OK;
+}
+
+float dsps_cplx_gen_phase_get(cplx_sig_t *cplx_gen)
+{
+    // A power-of-two LUT length is taken as the sign that the structure went through the init
+    if (dsp_is_power_of_two(cplx_gen->lut_len)) {
+        return cplx_gen->phase;
+    }
+
+    // -2 lies outside of the accepted phase range and marks the error
+    ESP_LOGE(TAG, "cplx_gen strucure was not initialized");
+    return -2;
+}
+
+/*
+ * Set a new frequency and a new phase of the generator in one call.
+ *
+ * Both values have to lie inside the open interval (-1, 1). When either of them is
+ * rejected, ESP_ERR_DSP_INVALID_PARAM is returned and neither field is modified.
+ */
+esp_err_t dsps_cplx_gen_set(cplx_sig_t *cplx_gen, float freq, float phase)
+{
+    // The negated "inside the range" tests also reject NaN, which compares false
+    // against every bound and would slip through a ">= 1 || <= -1" test
+    if (!((freq > -1) && (freq < 1))) {
+        ESP_LOGE(TAG, "The frequency is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    if (!((phase > -1) && (phase < 1))) {
+        ESP_LOGE(TAG, "The phase is out of range. Valid range is +/- 1. ");
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+
+    cplx_gen->phase = phase;
+    cplx_gen->freq = freq;
+    return ESP_OK;
+}
+
+/*
+ * Release the LUT when it was allocated by dsps_cplx_gen_init().
+ *
+ * A user supplied LUT (bit 0 of free_status cleared) is left alone. Calling the
+ * function repeatedly, or with a NULL pointer, is harmless.
+ */
+void cplx_gen_free(cplx_sig_t *cplx_gen)
+{
+    if (cplx_gen == NULL) {
+        return;
+    }
+
+    if (cplx_gen->free_status & 0x0001) {
+        free(cplx_gen->lut);
+        cplx_gen->lut = NULL;           // do not leave a dangling pointer behind
+        cplx_gen->free_status = 0;
+    }
+}
--- a/managed_components/espressif__esp-dsp/modules/support/cplx_gen/test/test_cplx_gen.c
+++ b/managed_components/espressif__esp-dsp/modules/support/cplx_gen/test/test_cplx_gen.c
@@ -0,0 +1,260 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <malloc.h>
+#include <stdint.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include "esp_dsp.h"
+#include <math.h>
+
+#include "dsp_tests.h"
+#include "dsps_cplx_gen.h"
+#include "dsps_wind.h"
+#include "dsps_view.h"
+#include "dsps_fft2r.h"
+
+#define LEAKAGE_BINS 10                     // fft leakage bins
+
+static const char *TAG = "dsps_cplx_gen";
+
+// Error message handler function, which detects errors returned by dsps_cplx_gen_init() function
+void error_msg_handler(cplx_sig_t *cplx_signal, esp_err_t status)
+{
+    // Nothing to report for a successful initialization
+    if (status == ESP_OK) {
+        return;
+    }
+
+    // Release the generator first, the failing assertion below ends the test case
+    cplx_gen_free(cplx_signal);
+
+    if (status == ESP_ERR_DSP_INVALID_LENGTH) {
+        TEST_ASSERT_MESSAGE(false, "LUT table has invalid length, must be power of 2");
+    } else if (status == ESP_ERR_DSP_PARAM_OUTOFRANGE) {
+        TEST_ASSERT_MESSAGE(false, "LUT table length must be in a range from 256 to 8192");
+    } else if (status == ESP_ERR_DSP_INVALID_PARAM) {
+        TEST_ASSERT_MESSAGE(false, "Frequency and initial phase must be in a range from -1 to 1");
+    } else {
+        TEST_ASSERT_MESSAGE(false, "Unspecified error");
+    }
+}
+
+// Compares the platform implementation selected by dsps_cplx_gen against the ANSI C
+// reference, sample by sample, for both the float and the Q15 output type.
+TEST_CASE("cplx_gen_functionality_test", "[dsps]")
+{
+    const int32_t out_len = 4096;
+    const int32_t lut_len = 1024;
+    const float frequency = 0.001;
+    const float init_phase = 0.1;
+
+    cplx_sig_t cplx_signal, cplx_signal_compare;
+
+    // F32 float, the second generator shares the LUT allocated by the first one
+    esp_err_t status1 = dsps_cplx_gen_init(&cplx_signal, F32_FLOAT, NULL, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal, status1);
+    esp_err_t status2 = dsps_cplx_gen_init(&cplx_signal_compare, F32_FLOAT, cplx_signal.lut, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal_compare, status2);
+
+    float *out_array_float = (float *)malloc(out_len * 2 * sizeof(float));          // times 2 for real and imaginary part
+    float *out_array_compare_float = (float *)malloc(out_len * 2 * sizeof(float));
+    TEST_ASSERT_NOT_NULL(out_array_float);
+    TEST_ASSERT_NOT_NULL(out_array_compare_float);
+
+    dsps_cplx_gen_ansi(&cplx_signal_compare, (void *)out_array_compare_float, out_len);
+    dsps_cplx_gen(&cplx_signal, (void *)out_array_float, out_len);
+
+    // TEST_ASSERT_EQUAL compares integers: floats from [-1, 1] would nearly all be
+    // converted to 0 and the check would pass regardless of the generated values
+    for (int i = 0; i < out_len * 2; i++) {
+        TEST_ASSERT_EQUAL_FLOAT(out_array_compare_float[i], out_array_float[i]);
+    }
+
+    free(out_array_float);
+    free(out_array_compare_float);
+    cplx_gen_free(&cplx_signal);
+    cplx_gen_free(&cplx_signal_compare);
+
+    // S16 fixed, the second generator shares the LUT allocated by the first one
+    status1 = dsps_cplx_gen_init(&cplx_signal, S16_FIXED, NULL, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal, status1);
+    status2 = dsps_cplx_gen_init(&cplx_signal_compare, S16_FIXED, cplx_signal.lut, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal_compare, status2);
+
+    int16_t *out_array_fixed = (int16_t *)malloc(out_len * 2 * sizeof(int16_t));    // times 2 for real and imaginary part
+    int16_t *out_array_compare_fixed = (int16_t *)malloc(out_len * 2 * sizeof(int16_t));
+    TEST_ASSERT_NOT_NULL(out_array_fixed);
+    TEST_ASSERT_NOT_NULL(out_array_compare_fixed);
+
+    dsps_cplx_gen_ansi(&cplx_signal_compare, (void *)out_array_compare_fixed, out_len);
+    dsps_cplx_gen(&cplx_signal, (void *)out_array_fixed, out_len);
+
+    for (int i = 0; i < out_len * 2; i++) {
+        TEST_ASSERT_EQUAL_INT16(out_array_compare_fixed[i], out_array_fixed[i]);
+    }
+
+    free(out_array_fixed);
+    free(out_array_compare_fixed);
+    cplx_gen_free(&cplx_signal);
+    cplx_gen_free(&cplx_signal_compare);
+}
+
+
+// Measures the CPU cycles spent in dsps_cplx_gen for the float and the Q15 output type
+// over a series of output lengths; the results are only logged, nothing is asserted on them.
+TEST_CASE("cplx_gen_benchmark_test", "[dsps]")
+{
+    int32_t out_len = 32;
+    const int32_t lut_len = 256;
+    const float frequency = 0.02;
+    const float init_phase = 0.9;
+    const int repeat_count = 4;
+    const int len_steps = 6;                                    // out_len is doubled after every step
+    const int32_t max_out_len = out_len << (len_steps - 1);     // longest output generated in the last step
+
+    cplx_sig_t cplx_signal_float, cplx_signal_fixed;
+
+    esp_err_t status1 = dsps_cplx_gen_init(&cplx_signal_float, F32_FLOAT, NULL, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal_float, status1);
+    esp_err_t status2 = dsps_cplx_gen_init(&cplx_signal_fixed, S16_FIXED, NULL, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal_fixed, status2);
+
+    // Buffers are sized for the longest run: max_out_len samples * 2 (real and imaginary)
+    float *out_array_float = (float *)malloc(max_out_len * 2 * sizeof(float));
+    int16_t *out_array_fixed = (int16_t *)malloc(max_out_len * 2 * sizeof(int16_t));
+    TEST_ASSERT_NOT_NULL(out_array_float);
+    TEST_ASSERT_NOT_NULL(out_array_fixed);
+
+    for (int i = 0; i < len_steps; i++) {
+        const unsigned int start_float = dsp_get_cpu_cycle_count();
+        for (int j = 0 ; j < repeat_count ; j++) {
+            dsps_cplx_gen(&cplx_signal_float, (void *)out_array_float, out_len);
+        }
+        const unsigned int end_float = dsp_get_cpu_cycle_count();
+
+        const unsigned int start_fixed = dsp_get_cpu_cycle_count();
+        for (int j = 0 ; j < repeat_count ; j++) {
+            dsps_cplx_gen(&cplx_signal_fixed, (void *)out_array_fixed, out_len);
+        }
+        const unsigned int end_fixed = dsp_get_cpu_cycle_count();
+
+        const float total_float = end_float - start_float;
+        const float total_fixed = end_fixed - start_fixed;
+        const float cycles_float = total_float / (float)(repeat_count);
+        const float cycles_fixed = total_fixed / (float)(repeat_count);
+        const float cycles_per_lut_sample_float = total_float / (float)(out_len * repeat_count);
+        const float cycles_per_lut_sample_fixed = total_fixed / (float)(out_len * repeat_count);
+
+        ESP_LOGI(TAG, "Float : %.2f total cycles, %.2f cycles per sample, for %"PRId32" LUT samples, %"PRId32" output array length",
+                 cycles_float, cycles_per_lut_sample_float, lut_len, out_len);
+
+        ESP_LOGI(TAG, "Fixed : %.2f total cycles, %.2f cycles per sample, for %"PRId32" LUT samples, %"PRId32" output array length \n",
+                 cycles_fixed, cycles_per_lut_sample_fixed, lut_len, out_len);
+
+        out_len *= 2;
+    }
+
+    free(out_array_fixed);
+    free(out_array_float);
+    cplx_gen_free(&cplx_signal_float);
+    cplx_gen_free(&cplx_signal_fixed);
+}
+
+
+// Generates a float complex signal, windows it, runs an FFT over it and logs signal power,
+// noise power, SNR and SFDR separately for the lower and the upper half of the spectrum.
+// The figures are only logged and plotted, nothing is asserted on them.
+TEST_CASE("cplx_gen_noise_SNR_test", "[dsps]")
+{
+    const int32_t out_len = 2048;
+    const int32_t lut_len = 8192;
+    const int32_t n_fft = out_len * 2;      // * 2 (real and imaginary)
+    const float frequency = 0.01;
+    const float init_phase = 0.0;
+    const float real_ampl = 0.5;
+    const float imag_ampl = 0.2;
+
+    cplx_sig_t cplx_signal_float;
+
+    esp_err_t status = dsps_cplx_gen_init(&cplx_signal_float, F32_FLOAT, NULL, lut_len, frequency, init_phase);
+    error_msg_handler(&cplx_signal_float, status);
+
+    float *out_array_float = (float *)memalign(16, n_fft * sizeof(float));
+    TEST_ASSERT_NOT_NULL(out_array_float);
+    dsps_cplx_gen(&cplx_signal_float, (void *)out_array_float, out_len);
+
+    // Signal windowing
+    float *window = (float *)memalign(16, out_len * sizeof(float));
+    TEST_ASSERT_NOT_NULL(window);
+    dsps_wind_blackman_harris_f32(window, out_len);
+
+    for (int i = 0 ; i < out_len ; i++) {
+        out_array_float[i * 2 + 0] *= (window[i] * real_ampl);
+        out_array_float[i * 2 + 1] *= (window[i] * imag_ampl);
+    }
+    free(window);
+
+    // Initialize FFT
+    esp_err_t ret = dsps_fft2r_init_fc32(NULL, n_fft);
+    if (ret  != ESP_OK) {
+        ESP_LOGE(TAG, "Not possible to initialize FFT. Error = %i", ret);
+        // Do not leak the signal buffer and the generator LUT on the early exit
+        free(out_array_float);
+        cplx_gen_free(&cplx_signal_float);
+        return;
+    }
+
+    // Do the FFT
+    dsps_fft2r_fc32(out_array_float, out_len);
+    dsps_bit_rev_fc32(out_array_float, out_len);
+    dsps_cplx2reC_fc32(out_array_float, out_len);
+
+    // Convert the FFT spectrum from amplitude to watts, find the max value and its position.
+    // The conversion is done in place: bin i is written after bins 2i and 2i + 1 were read.
+    float max_val_1 = -1000000, max_val_2 = -1000000;
+    int max_pos_1 = 0, max_pos_2 = 0, spur_pos_1 = 0, spur_pos_2 = 0;
+    for (int i = 0 ; i < n_fft / 2 ; i++) {
+        out_array_float[i] = (out_array_float[i * 2 + 0] * out_array_float[i * 2 + 0] + out_array_float[i * 2 + 1] * out_array_float[i * 2 + 1]) / (n_fft * 3);
+        if (i < n_fft / 4) {
+            if (out_array_float[i] > max_val_1) {
+                max_val_1 = out_array_float[i];
+                max_pos_1 = i;
+            }
+        } else {
+            if (out_array_float[i] > max_val_2) {
+                max_val_2 = out_array_float[i];
+                max_pos_2 = i;
+            }
+        }
+    }
+
+    // Calculate the power of the signal and noise of the spectrum and convert the spectrum to dB.
+    // Bins within LEAKAGE_BINS of the peak count as signal, everything else as noise.
+    float signal_pow_1 = 0, signal_pow_2 = 0, noise_pow_1 = 0, noise_pow_2 = 0;
+    float spur_1 = -1000000, spur_2 = -1000000;
+    for (int i = 0 ; i < n_fft / 2 ; i++) {
+        if (i < n_fft / 4) {
+            if ((i >= max_pos_1 - LEAKAGE_BINS) && (i <= max_pos_1 + LEAKAGE_BINS)) {
+                signal_pow_1 += out_array_float[i];
+            } else {
+                noise_pow_1 += out_array_float[i];
+                if (out_array_float[i] > spur_1) {
+                    spur_1 = out_array_float[i];
+                    spur_pos_1 = i;
+                }
+            }
+        } else {
+            if ((i >= max_pos_2 - LEAKAGE_BINS) && (i <= max_pos_2 + LEAKAGE_BINS)) {
+                signal_pow_2 += out_array_float[i];
+            } else {
+                noise_pow_2 += out_array_float[i];
+                if (out_array_float[i] > spur_2) {
+                    spur_2 = out_array_float[i];
+                    spur_pos_2 = i;
+                }
+            }
+        }
+        // the small offset keeps log10f away from log(0)
+        out_array_float[i] = 10 * log10f(0.0000000000001 + out_array_float[i]);
+    }
+
+    // Convert the signal power and noise power from watts to dB and calculate SNR and SFDR
+    const float snr_1 = 10 * log10f(signal_pow_1 / noise_pow_1);
+    const float snr_2 = 10 * log10f(signal_pow_2 / noise_pow_2);
+    noise_pow_1 = 10 * log10f(noise_pow_1);
+    noise_pow_2 = 10 * log10f(noise_pow_2);
+    signal_pow_1 = 10 * log10f(signal_pow_1);
+    signal_pow_2 = 10 * log10f(signal_pow_2);
+    const float sfdr_1 = out_array_float[max_pos_1] - out_array_float[spur_pos_1];
+    const float sfdr_2 = out_array_float[max_pos_2] - out_array_float[spur_pos_2];
+
+    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f \nSFDR: \t\t%f", signal_pow_1, noise_pow_1, snr_1, sfdr_1);
+    dsps_view(out_array_float, n_fft / 4, 64, 16,  -140, 40, '|');
+    putchar('\n');
+
+    ESP_LOGI(TAG, "\nSignal Power: \t%f\nNoise Power: \t%f\nSNR: \t\t%f \nSFDR: \t\t%f", signal_pow_2, noise_pow_2, snr_2, sfdr_2);
+    dsps_view(out_array_float + (n_fft / 4), n_fft / 4, 64, 16,  -140, 40, '|');
+
+    // Release the FFT tables allocated by dsps_fft2r_init_fc32(NULL, ...)
+    dsps_fft2r_deinit_fc32();
+    free(out_array_float);
+    cplx_gen_free(&cplx_signal_float);
+}
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_cplx_gen.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_cplx_gen.h
@@ -0,0 +1,187 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef _dsps_cplx_gen_H_
+#define _dsps_cplx_gen_H_
+
+#include "dsp_err.h"
+#include "dsps_cplx_gen_platform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @brief Enum defining the output data type of the complex generator
+ *
+ * The value selects both the sample format of the LUT and of the generated output.
+ */
+typedef enum output_data_type {
+    S16_FIXED = 0,              /*!< Q15 fixed point - int16_t*/
+    F32_FLOAT = 1,              /*!< Single precision floating point - float*/
+} out_d_type;
+
+
+/**
+ * @brief Data struct of the complex signal generator
+ *
+ * This structure is used by a complex generator internally. A user should access this structure only in case of
+ * extensions for the DSP Library.
+ * All the fields of this structure are initialized by the dsps_cplx_gen_init(...) function.
+ *
+ * The assembly implementation dsps_cplx_gen_ae32() reads the fields lut, lut_len, freq, phase and d_type
+ * at the fixed byte offsets 0, 4, 8, 12 and 16, so the order and the size of these fields must not change.
+ */
+typedef struct cplx_sig_s {
+    void       *lut;            /*!< Pointer to the lookup table.*/
+    int32_t     lut_len;        /*!< Length of the lookup table (number of entries, power of 2).*/
+    float       freq;           /*!< Frequency of the output signal, added to the phase after every sample. Range (-1 ... 1)*/
+    float       phase;          /*!< Phase (initial_phase during init), 1 is related to 2Pi*/
+    out_d_type  d_type;         /*!< Output data type*/
+    int16_t     free_status;    /*!< Indicator for cplx_gen_free(...) function, bit 0 is set when the LUT was allocated by the init function*/
+} cplx_sig_t;
+
+
+/**
+ * @brief Initialize structure for complex generator
+ *
+ * Function initializes a structure for either 16-bit fixed point, or 32-bit floating point complex generator using LUT table.
+ * cplx_gen_free(...) must be called, once the generator is not needed anymore to free dynamically allocated memory
+ *
+ * A user can specify his own LUT table and pass a pointer to the table (void *lut) during the initialization. If the LUT table
+ * pointer passed to the init function is a NULL, the LUT table is allocated and initialized internally with one period of a sine wave.
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ * @param d_type: output data type - out_d_type enum
+ * @param lut: pointer to a user-defined LUT, the data type is void so both (S16_FIXED, F32_FLOAT) types could be used
+ * @param lut_len: length of the LUT, must be a power of 2; when lut is NULL it must also be in a range from 256 to 8192
+ * @param freq: Frequency of the output signal in a range of (-1...1), the limits -1 and 1 are rejected.
+ *              NOTE(review): described as "1 is a Nyquist frequency", while the generator advances the phase by freq per sample - confirm the scaling
+ * @param initial_phase: initial phase of the complex signal in range of (-1..1) where 1 is related to 2Pi and -1 is related to -2Pi, the limits are rejected
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_cplx_gen_init(cplx_sig_t *cplx_gen, out_d_type d_type, void *lut, int32_t lut_len, float freq, float initial_phase);
+
+
+/**
+ * @brief function sets the output frequency of the complex generator
+ *
+ * set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ * @param freq: new frequency to be set in a range of (-1..1), the limits -1 and 1 are rejected
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - ESP_ERR_DSP_INVALID_PARAM if the frequency is out of the valid range, the stored frequency is not changed
+ */
+esp_err_t dsps_cplx_gen_freq_set(cplx_sig_t *cplx_gen, float freq);
+
+
+/**
+ * @brief function gets the output frequency of the complex generator
+ *
+ * get function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ *
+ * @return function returns frequency of the signal generator,
+ *         or -2 when the LUT length stored in the structure is not a power of 2 (structure not initialized)
+ */
+float dsps_cplx_gen_freq_get(cplx_sig_t *cplx_gen);
+
+
+/**
+ * @brief function sets the phase of the complex generator
+ *
+ * set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ * @param phase: new phase to be set in the range of (-1..1) where 1 is related to 2Pi and -1 is related to -2Pi, the limits are rejected
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - ESP_ERR_DSP_INVALID_PARAM if the phase is out of -1 ... 1 range, the stored phase is not changed
+ */
+esp_err_t dsps_cplx_gen_phase_set(cplx_sig_t *cplx_gen, float phase);
+
+
+/**
+ * @brief function gets the phase of the complex generator
+ *
+ * get function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ *
+ * @return function returns phase of the signal generator,
+ *         or -2 when the LUT length stored in the structure is not a power of 2 (structure not initialized)
+ */
+float dsps_cplx_gen_phase_get(cplx_sig_t *cplx_gen);
+
+
+/**
+ * @brief function sets the output frequency and the phase of the complex generator
+ *
+ * set function can be used after the cplx_gen structure was initialized by the dsps_cplx_gen_init(...) function
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ * @param freq: new frequency to be set in the range of (-1..1), the limits -1 and 1 are rejected
+ * @param phase: new phase to be set in the range of (-1..1) where 1 is related to 2Pi and -1 is related to -2Pi, the limits are rejected
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - ESP_ERR_DSP_INVALID_PARAM if the frequency is out of the valid range
+ *                                  if the phase is out of -1 ... 1 range
+ *                                  (neither the frequency nor the phase is changed in this case)
+ */
+esp_err_t dsps_cplx_gen_set(cplx_sig_t *cplx_gen, float freq, float phase);
+
+
+/**
+ * @brief function frees dynamically allocated memory, which was allocated in the init function
+ *
+ * free function must be called after the dsps_cplx_gen_init(...) is called, once the complex generator is not
+ * needed anymore. A LUT supplied by the user to the init function is not freed.
+ *
+ * @param cplx_gen: pointer to the complex signal generator structure
+ */
+void cplx_gen_free(cplx_sig_t *cplx_gen);
+
+
+/**
+ * @brief The function generates a complex signal
+ *
+ * the generated complex signal is in the form of two harmonic signals in either 16-bit signed fixed point
+ * or 32-bit floating point, stored interleaved with the cosine first:
+ *
+ * output[2*i + 0] = lut[cos_pos], corresponds to cos(2*Pi*(phase + freq*i))
+ * output[2*i + 1] = lut[sin_pos], corresponds to sin(2*Pi*(phase + freq*i))
+ * where the LUT is expected to hold one period of a sine wave and the phase is wrapped to the range [0..1)
+ *
+ * The phase stored in the generator structure is not updated by the call.
+ *
+ * dsps_cplx_gen_ansi() - The implementation uses ANSI C and could be compiled and run on any platform
+ * dsps_cplx_gen_ae32() - Is targeted for Xtensa cores
+ *
+ * @param cplx_gen: pointer to the generator structure
+ * @param output: output array (length of len*2), data type is void so both (S16_FIXED, F32_FLOAT) types could be used
+ * @param len: length of the output signal (number of complex samples)
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_cplx_gen_ansi(cplx_sig_t *cplx_gen, void *output, int32_t len);
+esp_err_t dsps_cplx_gen_ae32(cplx_sig_t *cplx_gen, void *output, int32_t len);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+// dsps_cplx_gen resolves to the Xtensa assembly implementation when the platform header enables it,
+// otherwise to the ANSI C implementation
+#if (dsps_cplx_gen_ae32_enbled || dsps_cplx_gen_aes3_enbled)
+#define dsps_cplx_gen dsps_cplx_gen_ae32
+#else // no optimized implementation enabled
+#define dsps_cplx_gen dsps_cplx_gen_ansi
+#endif // (dsps_cplx_gen_ae32_enbled || dsps_cplx_gen_aes3_enbled)
+
+#endif // _dsps_cplx_gen_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_cplx_gen_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_cplx_gen_platform.h
@@ -0,0 +1,30 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef _dsps_cplx_gen_platform_H_
+#define _dsps_cplx_gen_platform_H_
+
+#include "sdkconfig.h"
+
+// The optimized implementation is enabled only on Xtensa cores with FPU and zero-overhead loops.
+// On any other configuration both macros stay undefined and evaluate to 0 inside #if expressions.
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+
+#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dsps_cplx_gen_aes3_enbled  1
+#define dsps_cplx_gen_ae32_enbled  0
+
+#elif CONFIG_IDF_TARGET_ESP32
+#define dsps_cplx_gen_ae32_enbled  1
+#define dsps_cplx_gen_aes3_enbled  0
+
+#endif // CONFIG_IDF_TARGET_ESP32S3 CONFIG_IDF_TARGET_ESP32
+#endif // ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+#endif // __XTENSA__
+#endif // _dsps_cplx_gen_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_d_gen.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_d_gen.h
@@ -0,0 +1,47 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_d_gen_H_
+#define _dsps_d_gen_H_
+#include "dsp_err.h"
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief   delta function
+ *
+ * The function generates a delta function.
+ * output[i]=0, if i=[0..N), i != pos
+ * output[i]=1, if i=pos, pos: [0..N-1)
+ * The implementation uses ANSI C and can be compiled and run on any platform
+ *
+ * @param[out] output: output array.
+ * @param len: length of the output signal (N in the formulas above)
+ * @param pos: delta function position
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_d_gen_f32(float *output, int len, int pos);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _dsps_d_gen_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_h_gen.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_h_gen.h
@@ -0,0 +1,48 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_h_gen_H_
+#define _dsps_h_gen_H_
+#include "dsp_err.h"
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief   Heaviside function
+ *
+ * The Heaviside (unit step) function.
+ * output[i]=0, if i=[0..pos)
+ * output[i]=1, if i=[pos..N)
+ * The implementation uses ANSI C and can be compiled and run on any platform
+ *
+ * @param[out] output: output array.
+ * @param len: length of the output signal (N in the formulas above)
+ * @param pos: Heaviside function position (index of the first sample set to 1)
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+
+esp_err_t dsps_h_gen_f32(float *output, int len, int pos);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _dsps_h_gen_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_sfdr.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_sfdr.h
@@ -0,0 +1,51 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_sfdr_H_
+#define _dsps_sfdr_H_
+
+
+#include "dsp_err.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief   SFDR
+ *
+ * The function calculates the Spurious-Free Dynamic Range.
+ * The function computes the FFT of the input, searches for the spectrum maximum, and then compares
+ * the maximum value with all other values. The result is calculated as the minimum value.
+ * This function is intended for debug and unit tests only. It is not optimized for real-time processing.
+ * The implementation uses ANSI C and can be compiled and run on any platform
+ *
+ * @param[in] input: input array.
+ * @param len: length of the input signal
+ * @param use_dc: defines whether the DC value is used for the calculation (dsps_sfdr_f32 only).
+ *                0 - the calculation will not include DC power
+ *                1 - the calculation will include DC power
+ *
+ * @return
+ *      - SFDR in dB
+ */
+float dsps_sfdr_f32(const float *input, int32_t len, int8_t use_dc);
+float dsps_sfdr_fc32(const float *input, int32_t len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _dsps_sfdr_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_snr.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_snr.h
@@ -0,0 +1,51 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _DSP_SNR_H_
+#define _DSP_SNR_H_
+
+#include "dsp_err.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief   SNR
+ *
+ * The function calculates the signal-to-noise ratio for the case where the signal is a sine tone.
+ * The function computes the FFT of the input, searches for the spectrum maximum, and then calculates
+ * the SNR as the sum of all harmonics relative to the maximum value.
+ * This function is intended for debug and unit tests only. It is not optimized for real-time processing.
+ * The implementation uses ANSI C and can be compiled and run on any platform
+ *
+ * @param[in] input: input array.
+ * @param len: length of the input signal
+ * @param use_dc: defines whether the DC value is used for the calculation (dsps_snr_f32 only).
+ *                0 - SNR will not include DC power
+ *                1 - SNR will include DC power
+ *
+ * @return
+ *      - SNR in dB
+ */
+float dsps_snr_f32(const float *input, int32_t len, uint8_t use_dc);
+float dsps_snr_fc32(const float *input, int32_t len);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _DSP_SNR_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_tone_gen.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_tone_gen.h
@@ -0,0 +1,48 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_tone_gen_H_
+#define _dsps_tone_gen_H_
+#include "dsp_err.h"
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief   tone
+ *
+ * The function generates a tone signal.
+ * x[i]=Ampl*sin(2*PI*freq*i + phase/180*PI)   (NOTE(review): freq scaling assumed - confirm against the implementation)
+ * The implementation uses ANSI C and can be compiled and run on any platform
+ *
+ * @param[out] output: output array.
+ * @param len: length of the output signal
+ * @param Ampl: amplitude
+ * @param freq: Nyquist frequency -1..1
+ * @param phase: phase in degrees
+ *
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_tone_gen_f32(float *output, int len, float Ampl, float freq, float phase);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _dsps_tone_gen_H_
--- a/managed_components/espressif__esp-dsp/modules/support/include/dsps_view.h
+++ b/managed_components/espressif__esp-dsp/modules/support/include/dsps_view.h
@@ -0,0 +1,64 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _dsps_view_H_
+#define _dsps_view_H_
+
+#include "dsp_err.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**@{*/
+/**
+ * @brief   plot view
+ *
+ * Generic view function.
+ * This function takes input samples and shows them in the console as a plot.
+ * The main purpose is to give draft debug information to the DSP developer.
+ *
+ * @param[in] data: array with input samples (float for dsps_view, int16_t for dsps_view_s16).
+ * @param len: length of the input array
+ * @param width: plot width in symbols
+ * @param height: plot height in lines
+ * @param min: minimum value that will be limited by Axis Y.
+ * @param max: maximum value that will be limited by Axis Y.
+ * @param view_char: character used to draw the plot values ('.' or '|' etc)
+ *
+ */
+void dsps_view(const float *data, int32_t len, int width, int height, float min, float max, char view_char);
+void dsps_view_s16(const int16_t *data, int32_t len, int width, int height, float min, float max, char view_char);
+/**@}*/
+
+/**
+ * @brief   spectrum view
+ *
+ * View function that shows spectrum values on a 64x10 screen.
+ * The function is based on dsps_view.
+ *
+ * @param[in] data: array with input samples.
+ * @param len: length of the input array
+ * @param min: minimum value that will be limited by Axis Y.
+ * @param max: maximum value that will be limited by Axis Y.
+ *
+ */
+void dsps_view_spectrum(const float *data, int32_t len, float min, float max);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _dsps_view_H_
--- a/managed_components/espressif__esp-dsp/modules/support/mem/esp32s3/dsps_memcpy_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/support/mem/esp32s3/dsps_memcpy_aes3.S
@@ -0,0 +1,340 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_mem_platform.h"
+#if dsps_mem_aes3_enbled
+
+// This is memory access for ESP32S3 processor.
+    .text
+    .align  4
+    .global dsps_memcpy_aes3
+    .type   dsps_memcpy_aes3,@function
+// The function implements the following C code:
+// void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
+
+// Input params                 Variables
+//
+// arr_dest - a2                loop_len    - a5, a6
+// arr_src  - a3                p_arr_des   - a7
+// arr_len  - a4                div_48      - a8
+//                              align_mask  - a9
+
+/*
+esp32s3 optimized memcpy function works with both, aligned and unaligned data.
+
+arr_dest aligned -->     - _main_loop_aligned, 32 bytes in one run through the cycle, only aligned data
+arr_src  aligned /       - Check modulos to finish copying the remaining data outside of the cycle
+                         - Modulo 8 and 16 - S3 instructions for aligned data, the rest of the modulos are generic
+
+arr_dest aligned --->    - _main_loop_unaligned, 48 bytes of source unaligned data in one run through the cycle,
+arr_src unaligned /        (the destination must always be aligned)
+                         - Check modulos to finish copying remaining data outside of the cycle
+                         - Modulo 32 and 16 - S3 instructions for unaligned data, the rest of the modulos are generic
+
+arr_dest unaligned ->    - First, use generic instructions to align the arr_dest data (keep increasing
+arr_src aligned   /        the arr_dest pointer until the pointer is aligned)
+                         - Once arr_dest is aligned treat the rest of the data as:
+                             either both aligned (if arr_src happens to be aligned after the arr_dest aligning),
+                             or as arr_dest aligned and arr_src unaligned
+                         - Continue as mentioned above
+
+arr_dest unaligned ->    - Very same approach as with arr_dest unaligned and arr_src aligned
+arr_src unaligned /
+
+if the arr_len is less than 16, jump to _less_than_16 label and copy data without any s3 instructions or cycles
+*/
+#define MEMCPY_OPTIMIZED    1           // Use optimized memcpy or ANSI memcpy
+#define TIE_ENABLE          0           // Put a dummy TIE instruction to the ANSI memcpy to induce TIE context saving
+
+dsps_memcpy_aes3:
+
+#if MEMCPY_OPTIMIZED
+
+    // S3 optimized version of the memcpy (with TIE instructions)
+
+    entry    a1,    32
+    mov      a7,    a2                              // a7 - keep the original arr_dest pointer, it is the return value
+
+    blti     a4,    16,  _less_than_16              // signed compare: arr_len below 16 is copied without S3 instructions
+
+    // arr_dest alignment check
+    movi.n  a9,    0xf                              // 0xf alignment mask
+    and     a13,   a9,  a2                          // 0xf AND arr_dest pointer
+    beqz    a13,   _arr_dest_aligned
+
+        movi.n  a14,   16                           // a14 - 16
+        sub     a13,   a14,   a13                   // a13 = 16 - unalignment (bytes missing to the next 16-byte boundary)
+        sub     a4,    a4,    a13                   // len = len - (16 - unalignment)
+
+        // Aligning the arr_dest
+        // copy a13 leading bytes (8, then 4, then 2, then 1) so that arr_dest ends up 16-byte aligned
+
+        // Check modulo 8 of the unalignment, if - then copy 8 bytes
+        bbci a13,  3, _aligning_mod_8_check         // branch if 3-rd bit of unalignment a13 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15, offset 0
+            l32i.n      a14,  a3,  4                // load 32 bits from arr_src a3 to a14, offset 4
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2, offset 0
+            s32i.n      a14,  a2,  4                // save 32 bits from a14 to arr_dest a2, offset 4
+            addi.n      a3,   a3,  8                // increment arr_src pointer by 8 bytes
+            addi.n      a2,   a2,  8                // increment arr_dest pointer by 8 bytes
+        _aligning_mod_8_check:
+
+        // Check modulo 4 of the unalignment, if - then copy 4 bytes
+        bbci a13, 2, _aligning_mod_4_check          // branch if 2-nd bit of unalignment a13 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  4                // increment arr_src pointer by 4 bytes
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  4                // increment arr_dest pointer by 4 bytes
+        _aligning_mod_4_check:
+
+        // Check modulo 2 of the unalignment, if - then copy 2 bytes
+        bbci a13, 1, _aligning_mod_2_check          // branch if 1-st bit of unalignment a13 is clear
+            l16ui       a15,  a3,  0                // load 16 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  2                // increment arr_src pointer by 2 bytes
+            s16i        a15,  a2,  0                // save 16 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  2                // increment arr_dest pointer by 2 bytes
+        _aligning_mod_2_check:
+
+        // Check modulo 1 of the unalignment, if - then copy 1 byte
+        bbci a13, 0, _arr_dest_aligned              // branch if 0-th bit of unalignment a13 is clear
+            l8ui        a15,  a3,  0                // load 8 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  1                // increment arr_src pointer by 1 byte
+            s8i         a15,  a2,  0                // save 8 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  1                // increment arr_dest pointer by 1 byte
+
+    _arr_dest_aligned:
+
+    // arr_src alignment check
+    and     a15,   a9,  a3                          // 0xf (alignment mask) AND arr_src pointer
+    beqz    a15,   _arr_src_aligned
+
+        // arr_src unaligned, arr_dest aligned (arr_dest either aligned originally or modified to be aligned by the Aligning the arr_dest routine)
+
+        // Calculate modulo for non-aligned data
+        movi     a8,  89478486                      // a8 - div_48 constant = ceil(2^32 / 48)
+        muluh    a5,  a8,  a4                       // a5 - loop_len = arr_len / 48 (upper 32 bits of arr_len * div_48)
+        movi     a9,  48                            // a9 - 48 (the alignment mask in a9 is not needed any more)
+        mul16s   a8,  a9,  a5                       // a8 - 48 * loop_len (mul16s reads only the low 16 bits of loop_len; bits 5..0 of the product stay exact)
+        sub      a6,  a4,  a8                       // a6 - loop_len_MOD 48 (only bits 5..0 of a6 are tested below)
+
+        ee.ld.128.usar.ip   q2,  a3,  16            // Preload from arr_src (also records the arr_src byte offset in SAR_BYTE)
+        ee.ld.128.usar.ip   q3,  a3,  16            // Preload from arr_src
+
+        // Main loop arr_src unaligned
+        loopnez a5, ._main_loop_unaligned           // 48 bytes in one loop
+            ee.src.q.ld.ip    q4,  a3,  16, q2, q3  // preload and shift from arr_src
+            ee.vst.128.ip     q2,  a2,  16          // store to aligned arr_dest
+            ee.src.q.ld.ip    q2,  a3,  16, q3, q4  // preload and shift from arr_src
+            ee.vst.128.ip     q3,  a2,  16          // store to aligned arr_dest
+            ee.src.q.ld.ip    q3,  a3,  16, q4, q2  // preload and shift from arr_src
+            ee.vst.128.ip     q4,  a2,  16          // store to aligned arr_dest
+        ._main_loop_unaligned:
+
+        // Finish the _main_loop_unaligned outside of the loop from Q registers preloads
+        // Check modulo 32 of the loop_len_MOD, if - then copy 32 bytes
+        bbci   a6,  5,   _unaligned_mod_32_check    // branch if 5-th bit of loop_len_MOD a6 is clear
+            ee.src.q.ld.ip    q4,  a3,  0,  q2, q3  // preload (no pointer increment) and shift from arr_src
+            ee.vst.128.ip     q2,  a2,  16          // store to aligned arr_dest
+            ee.src.q          q3,  q3,  q4          // final shift
+            ee.vst.128.ip     q3,  a2,  16          // store to aligned arr_dest
+            j _follow_unaligned
+        _unaligned_mod_32_check:
+
+        // Check modulo 16 of the loop_len_MOD, if - then copy 16 bytes
+        bbci   a6, 4,   _unaligned_mod_16_check     // branch if 4-th bit of loop_len_MOD a6 is clear
+            ee.src.q          q2,  q2,  q3          // final shift
+            ee.vst.128.ip     q2,  a2,  16          // store to aligned arr_dest
+            addi              a3,  a3, -16          // put arr_src pointer back (only 16 of the 32 preloaded bytes were stored)
+            j _follow_unaligned
+        _unaligned_mod_16_check:
+
+        addi    a3, a3, -32                         // put arr_src pointer back (none of the 32 preloaded bytes were stored)
+
+        // Finish the _main_loop_unaligned outside of the loop
+        // Check modulo 8 of the loop_len_MOD, if - then copy 8 bytes
+        _follow_unaligned:
+        bbci a6, 3, _unaligned_mod_8_check          // branch if 3-rd bit of loop_len_MOD a6 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15, offset 0
+            l32i.n      a14,  a3,  4                // load 32 bits from arr_src a3 to a14, offset 4
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2, offset 0
+            s32i.n      a14,  a2,  4                // save 32 bits from a14 to arr_dest a2, offset 4
+            addi.n      a3,   a3,  8                // increment arr_src pointer by 8 bytes
+            addi.n      a2,   a2,  8                // increment arr_dest pointer by 8 bytes
+        _unaligned_mod_8_check:
+
+        // Finish the rest of the data (bits 2..0 of a6), as if the data were aligned, no S3 instructions will be used further after the jump
+        j _aligned_mod_8_check
+
+    // Both arrays (arr_src and arr_dest) aligned
+    _arr_src_aligned:
+
+    // Calculate modulo 32 for aligned data
+    srli    a5,    a4,   5                          // a5 - loop_len = arr_len / 32
+    slli    a6,    a5,   5                          // a6 - 32 * loop_len
+    sub     a6,    a4,  a6                          // a6 - loop_len_MOD 32
+
+    // Main loop arr_src aligned
+    loopnez  a5, ._main_loop_aligned                // 32 bytes in one loop
+        ee.vld.128.ip    q0,  a3,  16               // load 16 bytes from arr_src to q0
+        ee.vld.128.ip    q1,  a3,  16               // load 16 bytes from arr_src to q1
+
+        ee.vst.128.ip    q0,  a2,  16               // save 16 bytes to arr_dest from q0
+        ee.vst.128.ip    q1,  a2,  16               // save 16 bytes to arr_dest from q1
+    ._main_loop_aligned:
+
+    // Modulo 32 check
+    beqz    a6,    _aligned_mod_32_check            // branch if mod_32 = 0 (nothing left to copy)
+
+        // finish the end of the array outside of the main loop
+        // Check modulo 16 of the loop_len_MOD, if - then copy 16 bytes
+        bbci  a6, 4,  _aligned_mod_16_check         // branch if 4-th bit of loop_len_MOD a6 is clear
+            ee.vld.128.ip    q0,  a3,  16           // load 128 bits from arr_src to q0, increase arr_src pointer by 16 bytes
+            ee.vst.128.ip    q0,  a2,  16           // save 128 bits to arr_dest from q0, increase arr_dest pointer by 16 bytes
+        _aligned_mod_16_check:
+
+        // Check modulo 8 of the loop_len_MOD, if - then copy 8 bytes
+        bbci a6, 3, _aligned_mod_8_check            // branch if 3-rd bit of loop_len_MOD a6 is clear
+            ee.vld.l.64.ip    q0,  a3,  8           // load lower 64 bits from arr_src a3 to q0, increase arr_src pointer by 8 bytes
+            ee.vst.l.64.ip    q0,  a2,  8           // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
+        _aligned_mod_8_check:
+
+        // Check modulo 4 of the loop_len_MOD, if - then copy 4 bytes (the unaligned arr_src path joins here)
+        bbci a6, 2, _aligned_mod_4_check            // branch if 2-nd bit of loop_len_MOD a6 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  4                // increment arr_src pointer by 4 bytes
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  4                // increment arr_dest pointer by 4 bytes
+        _aligned_mod_4_check:
+
+        // Check modulo 2 of the loop_len_MOD, if - then copy 2 bytes
+        bbci a6, 1, _aligned_mod_2_check            // branch if 1-st bit of loop_len_MOD a6 is clear
+            l16ui       a15,  a3,  0                // load 16 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  2                // increment arr_src pointer by 2 bytes
+            s16i        a15,  a2,  0                // save 16 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  2                // increment arr_dest pointer by 2 bytes
+        _aligned_mod_2_check:
+
+        // Check modulo 1 of the loop_len_MOD, if - then copy 1 byte
+        bbci a6, 0, _aligned_mod_32_check           // branch if 0-th bit of loop_len_MOD a6 is clear
+            l8ui        a15,  a3,  0                // load 8 bits from arr_src a3 to a15
+            s8i         a15,  a2,  0                // save 8 bits from a15 to arr_dest a2
+
+    _aligned_mod_32_check:
+
+    mov      a2,    a7                              // copy the initial arr_dest pointer from a7 to arr_dest a2
+    retw.n                                          // return
+
+    _less_than_16:
+
+        // If the length of the copied array is lower than 16, it is faster not to use esp32s3-optimized functions
+
+        // Check modulo 8 of the arr_len, if - then copy 8 bytes
+        bbci    a4,  3, _less_than_16_mod_8_check   // branch if 3-rd bit of arr_len a4 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15, offset 0
+            l32i.n      a14,  a3,  4                // load 32 bits from arr_src a3 to a14, offset 4
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2, offset 0
+            s32i.n      a14,  a2,  4                // save 32 bits from a14 to arr_dest a2, offset 4
+            addi.n      a3,   a3,  8                // increment arr_src pointer by 8 bytes
+            addi.n      a2,   a2,  8                // increment arr_dest pointer by 8 bytes
+        _less_than_16_mod_8_check:
+
+        // Check modulo 4 of the arr_len, if - then copy 4 bytes
+        bbci a4, 2, _less_than_16_mod_4_check       // branch if 2-nd bit of arr_len a4 is clear
+            l32i.n      a15,  a3,  0                // load 32 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  4                // increment arr_src pointer by 4 bytes
+            s32i.n      a15,  a2,  0                // save 32 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  4                // increment arr_dest pointer by 4 bytes
+        _less_than_16_mod_4_check:
+
+        // Check modulo 2 of the arr_len, if - then copy 2 bytes
+        bbci a4, 1, _less_than_16_mod_2_check       // branch if 1-st bit of arr_len a4 is clear
+            l16ui       a15,  a3,  0                // load 16 bits from arr_src a3 to a15
+            addi.n      a3,   a3,  2                // increment arr_src pointer by 2 bytes
+            s16i        a15,  a2,  0                // save 16 bits from a15 to arr_dest a2
+            addi.n      a2,   a2,  2                // increment arr_dest pointer by 2 bytes
+        _less_than_16_mod_2_check:
+
+        // Check modulo 1 of the arr_len, if - then copy 1 byte
+        bbci a4, 0, _less_than_16_mod_1_check       // branch if 0-th bit of arr_len a4 is clear
+            l8ui        a15,  a3,  0                // load 8 bits from arr_src a3 to a15
+            s8i         a15,  a2,  0                // save 8 bits from a15 to arr_dest a2
+        _less_than_16_mod_1_check:
+
+    mov      a2,    a7                              // copy the initial arr_dest pointer from a7 to arr_dest a2
+    retw.n                                          // return
+
+
+#else   // MEMCPY_OPTIMIZED
+
+    // ansi version of the memcpy (without TIE instructions) for testing purposes
+
+    entry    a1,    32
+    mov      a7,    a2                              // a7 - save arr_dest pointer
+
+    srli     a5,    a4,   4                         // a5 - loop_len = arr_len / 16
+
+    // Run main loop which copies 16 bytes in one loop run
+    loopnez a5, ._ansi_loop
+        l32i.n      a15,  a3,  0                    // load 32 bits from arr_src a3 to a15, offset 0
+        l32i.n      a14,  a3,  4                    // load 32 bits from arr_src a3 to a14, offset 4
+        l32i.n      a13,  a3,  8                    // load 32 bits from arr_src a3 to a13, offset 8
+        l32i.n      a12,  a3,  12                   // load 32 bits from arr_src a3 to a12, offset 12
+        s32i.n      a15,  a2,  0                    // save 32 bits from a15 to arr_dest a2, offset 0
+        s32i.n      a14,  a2,  4                    // save 32 bits from a14 to arr_dest a2, offset 4
+        s32i.n      a13,  a2,  8                    // save 32 bits from a13 to arr_dest a2, offset 8
+        s32i.n      a12,  a2,  12                   // save 32 bits from a12 to arr_dest a2, offset 12
+        addi.n      a3,   a3,  16                   // increment arr_src pointer by 16 bytes
+        addi.n      a2,   a2,  16                   // increment arr_dest pointer by 16 bytes
+    ._ansi_loop:
+
+    // Finish the remaining bytes out of the loop
+    // Check modulo 8 of the arr_len, if - then copy 8 bytes
+    bbci a4, 3, _mod_8_check                        // branch if 3-rd bit of arr_len a4 is clear
+        l32i.n      a15,  a3,  0                    // load 32 bits from arr_src a3 to a15, offset 0
+        l32i.n      a14,  a3,  4                    // load 32 bits from arr_src a3 to a14, offset 4
+        s32i.n      a15,  a2,  0                    // save 32 bits from a15 to arr_dest a2, offset 0
+        s32i.n      a14,  a2,  4                    // save 32 bits from a14 to arr_dest a2, offset 4
+        addi.n      a3,   a3,  8                    // increment arr_src pointer by 8 bytes
+        addi.n      a2,   a2,  8                    // increment arr_dest pointer by 8 bytes
+    _mod_8_check:
+
+    // Check modulo 4 of the arr_len, if - then copy 4 bytes
+    bbci a4, 2, _mod_4_check                        // branch if 2-nd bit of arr_len a4 is clear
+        l32i.n      a15,  a3,  0                    // load 32 bits from arr_src a3 to a15
+        addi.n      a3,   a3,  4                    // increment arr_src pointer by 4 bytes
+        s32i.n      a15,  a2,  0                    // save 32 bits from a15 to arr_dest a2
+        addi.n      a2,   a2,  4                    // increment arr_dest pointer by 4 bytes
+    _mod_4_check:
+
+    // Check modulo 2 of the arr_len, if - then copy 2 bytes
+    bbci a4, 1, _mod_2_check                        // branch if 1-st bit of arr_len a4 is clear
+        l16ui       a15,  a3,  0                    // load 16 bits from arr_src a3 to a15
+        addi.n      a3,   a3,  2                    // increment arr_src pointer by 2 bytes
+        s16i        a15,  a2,  0                    // save 16 bits from a15 to arr_dest a2
+        addi.n      a2,   a2,  2                    // increment arr_dest pointer by 2 bytes
+    _mod_2_check:
+
+    // Check modulo 1 of the arr_len, if - then copy 1 byte
+    bbci a4, 0, _mod_1_check                        // branch if 0-th bit of arr_len a4 is clear
+        l8ui        a15,  a3,  0                    // load 8 bits from arr_src a3 to a15
+        s8i         a15,  a2,  0                    // save 8 bits from a15 to arr_dest a2
+    _mod_1_check:
+
+    // if arr_len is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
+    blti    a4,    16, _less_than_16_1              // branch, if arr_len a4 is shorter than 16 bytes
+    #if TIE_ENABLE                                  // put dummy TIE instruction to induce TIE context saving
+        ee.zero.qacc                                // clear the QACC accumulator (dummy TIE instruction, result is not used)
+    #else                   // TIE_ENABLE
+        nop                                         // compensate one cycle, when TIE is disabled to get the same benchmark value
+    #endif                  // TIE_ENABLE
+    _less_than_16_1:
+
+    mov      a2,    a7                              // copy the initial arr_dest pointer from a7 to arr_dest a2
+    retw.n                                          // return
+
+#endif  // MEMCPY_OPTIMIZED
+
+#endif  // dsps_mem_aes3_enbled
--- a/managed_components/espressif__esp-dsp/modules/support/mem/esp32s3/dsps_memset_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/support/mem/esp32s3/dsps_memset_aes3.S
@@ -0,0 +1,248 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "dsps_mem_platform.h"
+#if dsps_mem_aes3_enbled
+
+// This is memory access for ESP32S3 processor.
+    .text
+    .align  4
+    .global dsps_memset_aes3
+    .type   dsps_memset_aes3,@function
+// The function implements the following C code:
+// void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
+
+// Input params                 Variables
+//
+// arr_dest - a2                loop_len    - a5
+// set_val  - a3                p_arr_dest  - a8
+// set_size - a4                8_bit_set   - a7
+//                              16_bit_set  - a9
+//                              32_bit_set  - a10
+//                              align_mask  - a11
+
+/*
+esp32s3 optimized memset function works with both, aligned and unaligned data.
+
+arr_dest aligned         - _main_loop, 16 bytes in one loop, only aligned data
+                         - Check modulos to finish setting the remaining data outside of the loop
+                         - Modulo 8 - S3 instruction for aligned data, the rest of the modulos are generic
+
+arr_dest unaligned       - First, use generic instructions to align the arr_dest data (keep increasing 
+                           the arr_dest pointer until the pointer is aligned)
+                         - Once arr_dest is aligned treat the rest of the data as aligned, same as above
+
+if the set_size is less than 16, jump to _less_than_16 label and set data without any s3 instructions or loops
+*/
+
+#define MEMSET_OPTIMIZED    1           // Use optimized memset or ansi memset
+#define TIE_ENABLE          0           // Put a dummy TIE instruction to ANSI memset to induce TIE context saving
+
+dsps_memset_aes3:
+// In: a2 - arr_dest, a3 - set_val (only the low 8 bits are used), a4 - set_size; returns the original arr_dest in a2
+#if MEMSET_OPTIMIZED
+
+    entry   a1,    32
+    mov     a8,    a2                               // a8 - save arr_dest pointer (function return value)
+    extui   a7,    a3,   0,   8                     // a7 - 8-bit set: set_val a3 with the upper 24 bits masked
+    bltui   a4,    16, _less_than_16                // set_size (unsigned size_t) shorter than 16
+
+    // set_size is 16 or more: the q0 register and the TIE store instructions are used from here on
+    movi.n  a11,   0xf                              // 0xf  alignment mask
+
+    bnez.n  a7, _non_zero_constant
+        ee.zero.q  q0                               // initialize q0 to zero
+        movi.n     a9,  0                           // initialize (16_bit_set) a9 to zero
+        movi.n     a10, 0                           // initialize (32_bit_set) a10 to zero
+        j _q_reg_prepared
+
+    _non_zero_constant:
+        // Fill q register
+        slli    a6,    a7,   8                      // a6 - (masked)set_val << 8
+        or      a9,    a6,   a7                     // a9 - (masked)set_val << 8 + (masked)set_val
+                                                    // a9 - 16-bit set
+        slli    a15,    a9,   16                    // a15 - a9 << 16
+        or      a10,    a9,   a15                   // broadcast 8 bits from (masked)set_val a7 to 32 bits
+                                                    // a10 - 32-bit set
+        ee.movi.32.q   q0,   a10,  0                // fill q0 register from a10 by 32 bits
+        ee.movi.32.q   q0,   a10,  1
+        ee.movi.32.q   q0,   a10,  2
+        ee.movi.32.q   q0,   a10,  3
+
+    _q_reg_prepared:
+
+    // alignment check
+    and     a15,   a11,  a2                         // 0xf (alignment mask) AND arr_dest pointer
+    beqz    a15,   _arr_dest_aligned                // branch if a15 equals to zero (arr_dest is 16-byte aligned)
+
+        movi.n  a14,   16                           // a14 - 16
+        sub     a15,   a14,   a15                   // a15 = 16 - unalignment (bytes to set until arr_dest is aligned)
+        sub     a4,    a4,    a15                   // set_size = set_size - (16 - unalignment)
+
+        // keep setting until arr_dest is aligned
+        // Check modulo 8 of the unalignment, if - then set 8 bytes
+        bbci    a15,  3, _aligning_mod_8_check      // branch if 3-rd bit of unalignment a15 is clear
+            s32i.n      a10,  a2,  0                // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+            s32i.n      a10,  a2,  4                // save 32 bits from a10 to arr_dest a2, offset 4 bytes
+            addi.n      a2,   a2,  8                // increment arr_dest pointer by 8 bytes
+        _aligning_mod_8_check:
+
+        // Check modulo 4 of the unalignment, if - then set 4 bytes
+        bbci a15, 2, _aligning_mod_4_check          // branch if 2-nd bit of unalignment a15 is clear
+            s32i.n      a10,  a2,  0                // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+            addi.n      a2,   a2,  4                // increment arr_dest pointer by 4 bytes
+        _aligning_mod_4_check:
+
+        // Check modulo 2 of the unalignment, if - then set 2 bytes
+        bbci a15, 1, _aligning_mod_2_check          // branch if 1-st bit of unalignment a15 is clear
+            s16i        a9,   a2,  0                // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+            addi.n      a2,   a2,  2                // increment arr_dest pointer by 2 bytes
+        _aligning_mod_2_check:
+
+        // Check modulo 1 of the unalignment, if - then set 1 byte
+        bbci a15, 0, _arr_dest_aligned              // branch if 0-th bit of unalignment a15 is clear
+            s8i         a7,   a2,  0                // save 8 bits from a7 to arr_dest a2, offset 0 bytes
+            addi.n      a2,   a2,  1                // increment arr_dest pointer by 1 byte
+
+
+    _arr_dest_aligned:
+    // Calculate main loop_len
+    srli    a5,    a4,   4                          // a5 - loop_len = set_size / 16
+
+    // Main loop
+    loopnez  a5, ._main_loop                        // 16 bytes in one loop
+        ee.vst.128.ip q0, a2, 16                    // store 16 bytes from q0 to arr_dest a2
+    ._main_loop:
+
+    // Check modulo 8 of the set_size, if - then set 8 bytes
+    bbci a4, 3, _aligned_mod_8_check                // branch if 3-rd bit of set_size a4 is clear
+        ee.vst.l.64.ip    q0,  a2,  8               // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
+    _aligned_mod_8_check:
+
+    // Check modulo 4 of the set_size, if - then set 4 bytes
+    bbci a4, 2, _aligned_mod_4_check                // branch if 2-nd bit of set_size a4 is clear
+        s32i.n      a10,  a2,  0                    // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+        addi.n      a2,   a2,  4                    // increment arr_dest pointer by 4 bytes
+    _aligned_mod_4_check:
+
+    // Check modulo 2 of the set_size, if - then set 2 bytes
+    bbci a4, 1, _aligned_mod_2_check                // branch if 1-st bit of set_size a4 is clear
+        s16i        a9,   a2,  0                    // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+        addi.n      a2,   a2,  2                    // increment arr_dest pointer by 2 bytes
+    _aligned_mod_2_check:
+
+    // Check modulo 1 of the set_size, if - then set 1 byte
+    bbci a4, 0, _aligned_mod_1_check                // branch if 0-th bit of set_size a4 is clear
+        s8i         a7,   a2,  0                    // save 8 bits from a7 to arr_dest a2, offset 0 bytes
+    _aligned_mod_1_check:
+
+    mov     a2,   a8                                // copy the initial arr_dest pointer from a8 to arr_dest a2
+    retw.n                                          // return
+
+    _less_than_16:
+
+        // make 16-bit set_val from the masked 8-bit set_val (same a7 / a9 register roles as in the long path)
+        slli    a6,    a7,   8                      // a6 - (masked)set_val << 8
+        or      a9,    a6,   a7                     // a9 - 16-bit set
+
+        // Check modulo 8 of the set_size, if - then set 8 bytes
+        bbci a4, 3, _less_than_16_mod_8_check       // branch if 3-rd bit of set_size a4 is clear
+            s16i        a9,  a2,  0                 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+            s16i        a9,  a2,  2                 // save 16 bits from a9 to arr_dest a2, offset 2 bytes
+            s16i        a9,  a2,  4                 // save 16 bits from a9 to arr_dest a2, offset 4 bytes
+            s16i        a9,  a2,  6                 // save 16 bits from a9 to arr_dest a2, offset 6 bytes
+            addi.n      a2,  a2,  8                 // increment arr_dest pointer by 8 bytes
+        _less_than_16_mod_8_check:
+
+        // Check modulo 4 of the set_size, if - then set 4 bytes
+        bbci a4, 2, _less_than_16_mod_4_check       // branch if 2-nd bit of set_size a4 is clear
+            s16i        a9,  a2,  0                 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+            s16i        a9,  a2,  2                 // save 16 bits from a9 to arr_dest a2, offset 2 bytes
+            addi.n      a2,  a2,  4                 // increment arr_dest pointer by 4 bytes
+        _less_than_16_mod_4_check:
+
+        // Check modulo 2 of the set_size, if - then set 2 bytes
+        bbci a4, 1, _less_than_16_mod_2_check       // branch if 1-st bit of set_size a4 is clear
+            s16i        a9,  a2,  0                 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+            addi.n      a2,  a2,  2                 // increment arr_dest pointer by 2 bytes
+        _less_than_16_mod_2_check:
+
+        // Check modulo 1 of the set_size, if - then set 1 byte
+        bbci a4, 0, _less_than_16_mod_1_check       // branch if 0-th bit of set_size a4 is clear
+            s8i         a7,  a2,   0                // save 8 bits from a7 to arr_dest a2, offset 0 bytes
+        _less_than_16_mod_1_check:
+
+    mov     a2,   a8                                // copy the initial arr_dest pointer from a8 to arr_dest a2
+    retw.n                                          // return
+
+
+#else   // MEMSET_OPTIMIZED
+
+    // ansi version of the memset (without TIE instructions) for testing purposes
+
+    entry    a1,    32
+    mov      a8,    a2                              // a8 - save arr_dest pointer
+
+    movi.n  a7,    0xff                             // 0xff one-byte mask
+    and     a7,    a7,   a3                         // a7 - 8-bit set: mask upper 24 bits of set_val a3
+
+    slli    a6,    a7,   8                          // a6 - (masked)set_val << 8
+    or      a9,    a6,   a7                         // a9 - (masked)set_val << 8 + (masked)set_val
+                                                    // a9 - 16-bit set
+    slli    a15,    a9,   16                        // a15 - a9 << 16
+    or      a10,    a9,   a15                       // a10 - 32-bit set: 8 bits of set_val broadcast to 32 bits
+
+    srli    a5,    a4,   4                          // a5 - loop_len = set_size / 16
+
+    // Run main loop which sets 16 bytes in one loop run
+    loopnez a5, ._ansi_loop
+        s32i.n      a10,  a2,  0                    // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+        s32i.n      a10,  a2,  4                    // save 32 bits from a10 to arr_dest a2, offset 4 bytes
+        s32i.n      a10,  a2,  8                    // save 32 bits from a10 to arr_dest a2, offset 8 bytes
+        s32i.n      a10,  a2,  12                   // save 32 bits from a10 to arr_dest a2, offset 12 bytes
+        addi.n      a2,   a2,  16                   // increment arr_dest pointer by 16 bytes
+    ._ansi_loop:
+
+    // Finish the remaining bytes out of the loop
+    // Check modulo 8 of the set_size, if - then set 8 bytes
+    bbci a4, 3, _mod_8_check                        // branch if 3-rd bit of set_size a4 is clear
+        s32i.n      a10,  a2,  0                    // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+        s32i.n      a10,  a2,  4                    // save 32 bits from a10 to arr_dest a2, offset 4 bytes
+        addi.n      a2,   a2,  8                    // increment arr_dest pointer by 8 bytes
+    _mod_8_check:
+
+    // Check modulo 4 of the set_size, if - then set 4 bytes
+    bbci a4, 2, _mod_4_check                        // branch if 2-nd bit of set_size a4 is clear
+        s32i.n      a10,  a2,  0                    // save 32 bits from a10 to arr_dest a2, offset 0 bytes
+        addi.n      a2,   a2,  4                    // increment arr_dest pointer by 4 bytes
+    _mod_4_check:
+
+    // Check modulo 2 of the set_size, if - then set 2 bytes
+    bbci a4, 1, _mod_2_check                        // branch if 1-st bit of set_size a4 is clear
+        s16i        a9,  a2,  0                     // save 16 bits from a9 to arr_dest a2, offset 0 bytes
+        addi.n      a2,  a2,  2                     // increment arr_dest pointer by 2 bytes
+    _mod_2_check:
+
+    // Check modulo 1 of the set_size, if - then set 1 byte
+    bbci a4, 0, _mod_1_check                        // branch if 0-th bit of set_size a4 is clear
+        s8i         a7,  a2,   0                    // save 8 bits from a7 to arr_dest a2, offset 0 bytes
+    _mod_1_check:
+
+    // if set_size is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
+    bltui   a4,    16, _less_than_16_1              // set_size (unsigned) shorter than 16, skip the dummy instruction
+    #if TIE_ENABLE                                  // put dummy TIE instruction to induce TIE context saving
+        ee.zero.qacc                                // clear the QACC accumulator (dummy TIE instruction)
+    #else                   // TIE_ENABLE
+        nop                                         // compensate one cycle, when TIE is disabled to get the same benchmark value
+    #endif                  // TIE_ENABLE
+    _less_than_16_1:
+
+    mov      a2,    a8                              // copy the initial arr_dest pointer from a8 to arr_dest a2
+    retw.n                                          // return
+
+#endif  // MEMSET_OPTIMIZED
+
+#endif  // dsps_mem_aes3_enbled
--- a/managed_components/espressif__esp-dsp/modules/support/mem/include/dsps_mem.h
+++ b/managed_components/espressif__esp-dsp/modules/support/mem/include/dsps_mem.h
@@ -0,0 +1,67 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef _dsps_mem_H_
+#define _dsps_mem_H_
+
+#include <stdint.h>     // uint8_t used by the dsps_memset_aes3 prototype
+#include <string.h>     // size_t; memcpy / memset, which dsps_memcpy / dsps_memset map to without the aes3 optimization
+#include "dsp_err.h"
+#include "dsp_common.h"
+#include "dsps_mem_platform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**@{*/
+/**
+ *  @brief memory copy function using esp32s3 TIE
+ *
+ * The extension (_aes3) is optimized for esp32S3 chip.
+ *
+ * @param arr_dest: pointer to the destination array
+ * @param arr_src: pointer to the source array
+ * @param arr_len: count of bytes to be copied from arr_src to arr_dest
+ *
+ * @return: pointer to dest array
+ */
+void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
+/**@}*/
+/**@{*/
+/**
+ *  @brief memory set function using esp32s3 TIE
+ *
+ * The extension (_aes3) is optimized for esp32S3 chip.
+ *
+ * @param arr_dest: pointer to the destination array
+ * @param set_val: byte value the dest array will be filled with
+ * @param set_size: count of bytes of the dest array to be set
+ *
+ * @return: pointer to dest array
+ */
+void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
+/**@}*/
+#ifdef __cplusplus
+}
+#endif
+
+#if CONFIG_DSP_OPTIMIZED
+#if dsps_mem_aes3_enbled
+#define dsps_memcpy dsps_memcpy_aes3
+#define dsps_memset dsps_memset_aes3
+#else
+#define dsps_memcpy memcpy
+#define dsps_memset memset
+#endif
+
+#else // CONFIG_DSP_OPTIMIZED
+#define dsps_memcpy memcpy
+#define dsps_memset memset
+
+#endif // CONFIG_DSP_OPTIMIZED
+#endif // _dsps_mem_H_
--- a/managed_components/espressif__esp-dsp/modules/support/mem/include/dsps_mem_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/support/mem/include/dsps_mem_platform.h
@@ -0,0 +1,21 @@
+#ifndef _dsps_mem_platform_H_
+#define _dsps_mem_platform_H_
+
+#include "sdkconfig.h"
+
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dsps_mem_aes3_enbled  1
+#endif // CONFIG_IDF_TARGET_ESP32S3
+#endif // XCHAL_HAVE_FP && XCHAL_HAVE_LOOPS
+#endif // __XTENSA__
+
+// Fall back to 0, so the macro is always defined (non-Xtensa targets, cores without FP / loops, other chips)
+#ifndef dsps_mem_aes3_enbled
+#define dsps_mem_aes3_enbled  0
+#endif // dsps_mem_aes3_enbled
+#endif // _dsps_mem_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/support/mem/test/test_dsps_memcpy_memset.c
+++ b/managed_components/espressif__esp-dsp/modules/support/mem/test/test_dsps_memcpy_memset.c
@@ -0,0 +1,728 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <malloc.h>
+#include <stdbool.h>
+#include <string.h>
+#include <inttypes.h>
+#include "unity.h"
+#include "esp_log.h"
+#include "esp_err.h"
+#include "esp_dsp.h"
+
+#include "dsps_mem.h"
+#include "dsp_tests.h"
+
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/semphr.h"
+#include "freertos/queue.h"
+#include "freertos/timers.h"
+#include "esp_task_wdt.h"
+
+#define CORNERS_CPY_SET_COUNT       200
+#define MEMCPY_REPORT_LEN           100
+#define MEMSET_REPORT_LEN           50
+#define CALL_REPEAT_COUNT           1000
+#define TEST_PINNED_NUM_TASKS       2
+#define TEST_PINNED_NUM_ITERS       2
+#define CPY_REPEAT_COUNT            500
+#define CPY_ITERS                   40
+#define AREA_LENGTH                 1024
+
+static const char *TAG = "dsps_mem_access";
+
+/*
+Test functionality of the memcpy and memset functions optimized for esp32s3
+
+Requires: esp32s3
+
+Purpose:
+    - Test that esp32s3 optimized memcpy and memset have the same functionality as the original memcpy and memset
+
+Procedure:
+    - Create 4 arrays, 2 source arrays (aligned and unaligned) and 2 destination arrays (aligned and unaligned)
+    - Initialize the destination arrays to 0, fill the source arrays with non-zero values
+    - Copy the desired length of content from the source array to the destination array using memcpy
+    - Compare the content of the destination array with the content of the source array
+    - Initialize the destination arrays to 0
+    - Repeat the 3 above steps for different copy lengths (especially corner conditions like copy 1, 2, 3... and N, N - 1, N - 2... bytes)
+      and following arrays alignments
+        - destination array 16-byte aligned, source array 16-byte aligned
+        - destination array unaligned,       source array 16-byte aligned
+        - destination array 16-byte aligned, source array unaligned
+        - destination array unaligned,       source array unaligned
+    - Set the desired length of the destination array using memset
+    - Compare the content of the destination array with the set constant
+    - Initialize the destination arrays to 0
+    - Repeat the 3 above steps for different set lengths (especially corner conditions like set 1, 2, 3... and N, N - 1, N - 2... bytes)
+      and both alignments of the destination array (16-byte aligned or unaligned)
+    - Free the dynamic array
+*/
+
+TEST_CASE("dsps_memcpy_memset_aes3_functionality", "[dsps]")
+{
+    const size_t arr_len = 1024;
+    const uint8_t set_val = 0xaa;
+    const size_t full_count = arr_len;
+    const size_t canary_bytes = 16;                     // canary bytes to check a possible overflow
+    const unsigned int align_combinations_cpy = 4;      // source and destination arrays aligned or unaligned combinations
+    const unsigned int align_combinations_set = 2;      // destination array aligned or unaligned
+
+    uint8_t *arr_dest_align = (uint8_t *)memalign(16, (arr_len + canary_bytes) * sizeof(uint8_t));
+    uint8_t *arr_src_align  = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
+    uint8_t *arr_dest_base  = (uint8_t *)memalign(16, (arr_len + canary_bytes + 1) * sizeof(uint8_t));
+    uint8_t *arr_src_base   = (uint8_t *)memalign(16, (arr_len + 1) * sizeof(uint8_t));
+    TEST_ASSERT_NOT_NULL(arr_dest_align);
+    TEST_ASSERT_NOT_NULL(arr_src_align);
+    TEST_ASSERT_NOT_NULL(arr_dest_base);
+    TEST_ASSERT_NOT_NULL(arr_src_base);
+
+    uint8_t *arr_dest_unalign = arr_dest_base + 1;      // 16-byte aligned base shifted by 1 byte: guaranteed to be unaligned
+    uint8_t *arr_src_unalign  = arr_src_base + 1;       // (a plain malloc() may return a 16-byte aligned pointer by chance)
+    uint8_t *arr_dest = NULL, *arr_src = NULL;
+
+    for (size_t i = 0; i < arr_len; i++) {
+        arr_src_align[i] = (uint8_t)i;
+        arr_src_unalign[i] = (uint8_t)i;
+    }
+
+    // canary bytes
+    for (size_t i = arr_len; i < (arr_len + canary_bytes); i++) {
+        arr_dest_align[i] = 0;
+        arr_dest_unalign[i] = 0;
+    }
+
+    // aes3 memcpy functionality
+    for (unsigned int align = 0; align < align_combinations_cpy; align++) {     // aligned and unaligned arrays test loop
+        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};         // amount of bytes to be copied
+
+        switch (align) {
+        case 0:                             // both 16-byte aligned
+            arr_src = arr_src_align;
+            arr_dest = arr_dest_align;
+            break;
+
+        case 1:                             // destination unaligned, source aligned
+            arr_src = arr_src_align;
+            arr_dest = arr_dest_unalign;
+            break;
+
+        case 2:                             // source unaligned, destination aligned
+            arr_src = arr_src_unalign;
+            arr_dest = arr_dest_align;
+            break;
+
+        case 3:                             // both unaligned
+            arr_src = arr_src_unalign;
+            arr_dest = arr_dest_unalign;
+            break;
+
+        default:                            // default - both aligned
+            arr_src = arr_src_align;
+            arr_dest = arr_dest_align;
+            break;
+        }
+
+        for (int var = 0; var < 2; var++) {                                     // test corner conditions
+            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {                   // mem_cpy from 1 to CORNERS_CPY_SET_COUNT
+                //         from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
+                memset(arr_dest, 0, full_count);                                // Destination array initializing (canary bytes are left untouched)
+
+                dsps_memcpy((void *)arr_dest, (void *)arr_src, ++byte_count[var]);
+
+                TEST_ASSERT_EQUAL_UINT8_ARRAY(arr_src, arr_dest, byte_count[var]);
+                if (byte_count[var] < arr_len) {
+                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[byte_count[var]], (arr_len - byte_count[var]));
+                }
+                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
+            }
+        }
+    }
+
+    // aes3 memset functionality
+    for (unsigned int align = 0; align < align_combinations_set; align++) {     // aligned and unaligned arrays test loop
+        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};         // amount of bytes to be set
+        if (!align) {
+            arr_dest = arr_dest_align;
+        } else {
+            arr_dest = arr_dest_unalign;
+        }
+
+        for (int var = 0; var < 2; var++) {                                     // test corner conditions
+            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {                   // mem_set from 1 to CORNERS_CPY_SET_COUNT
+                //         from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
+                memset(arr_dest, 0, full_count);                                // Destination array initializing (canary bytes are left untouched)
+
+                dsps_memset((void *)arr_dest, set_val, ++byte_count[var]);
+
+                TEST_ASSERT_EACH_EQUAL_UINT8(set_val, arr_dest, byte_count[var]);
+                if (byte_count[var] < arr_len) {
+                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[byte_count[var]], (arr_len - byte_count[var]));
+                }
+                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
+            }
+        }
+    }
+
+    free(arr_dest_align);
+    free(arr_src_align);
+    free(arr_dest_base);                                // the unaligned pointers are offsets into the base allocations
+    free(arr_src_base);
+}
+
+
+/*
+Test micro-benchmark of the memcpy and memset functions optimized for esp32s3 and esp32
+
+Requires: esp32s3
+
+Purpose:
+    - Test how fast the esp32s3 optimized memcpy and memset are compared to the esp32 optimized memcpy and memset
+
+Procedure:
+    - Create 2 unaligned arrays, source and destination array
+    - Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
+    - Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
+    - Set the destination array using esp32s3 memset N times, while counting CPU cycles
+    - Set the destination array using esp32 memset N times, while counting CPU cycles
+    - Calculate benchmarks
+    - Free both arrays
+*/
+
+TEST_CASE("dsps_memcpy_memset_aes3_benchmark", "[dsps]")
+{
+    const size_t area_len = AREA_LENGTH;                // full length of the area (in bytes)
+    const size_t full_count = sizeof(uint8_t) * area_len;
+    const uint8_t set_val = 0xee;                       // constant value, the destination array will be set with
+
+    uint8_t *arr_src = (uint8_t *)malloc(area_len * sizeof(uint8_t));
+    uint8_t *arr_dest = (uint8_t *)malloc(area_len * sizeof(uint8_t));
+    TEST_ASSERT_NOT_NULL(arr_src);                      // do not benchmark (and dereference) a failed allocation
+    TEST_ASSERT_NOT_NULL(arr_dest);
+
+    // Memcpy benchmark
+    const unsigned int start_aes3_memcpy = dsp_get_cpu_cycle_count();
+    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+        dsps_memcpy((void *)arr_dest, (void *)arr_src, full_count);
+    }
+    const unsigned int end_aes3_memcpy = dsp_get_cpu_cycle_count();
+
+    const unsigned int start_ae32_memcpy = dsp_get_cpu_cycle_count();
+    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+        memcpy((void *)arr_dest, (void *)arr_src, full_count);
+    }
+    const unsigned int end_ae32_memcpy = dsp_get_cpu_cycle_count();
+
+    const float aes3_cycles_memcpy = ((float)(end_aes3_memcpy - start_aes3_memcpy)) / CALL_REPEAT_COUNT;
+    const float ae32_cycles_memcpy = ((float)(end_ae32_memcpy - start_ae32_memcpy)) / CALL_REPEAT_COUNT;
+    ESP_LOGI(TAG, "Micro benchmark of memcpy for unaligned array of %"PRIu32" bytes", (uint32_t)full_count);
+    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ae32_cycles_memcpy);
+    ESP_LOGI(TAG, "S3  optimized cycles  = %.2f", aes3_cycles_memcpy);
+
+    // Memset benchmark
+    const unsigned int start_aes3_memset = dsp_get_cpu_cycle_count();
+    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+        dsps_memset((void *)arr_dest, set_val, full_count);
+    }
+    const unsigned int end_aes3_memset = dsp_get_cpu_cycle_count();
+
+    const unsigned int start_ae32_memset = dsp_get_cpu_cycle_count();
+    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+        memset((void *)arr_dest, set_val, full_count);
+    }
+    const unsigned int end_ae32_memset = dsp_get_cpu_cycle_count();
+
+    const float ae32_cycles_memset = ((float)(end_ae32_memset - start_ae32_memset)) / CALL_REPEAT_COUNT;
+    const float aes3_cycles_memset = ((float)(end_aes3_memset - start_aes3_memset)) / CALL_REPEAT_COUNT;
+    ESP_LOGI(TAG, "Micro benchmark of memset for unaligned array of %"PRIu32" bytes", (uint32_t)full_count);
+    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ae32_cycles_memset);
+    ESP_LOGI(TAG, "S3  optimized cycles  = %.2f", aes3_cycles_memset);
+
+    free(arr_src);
+    free(arr_dest);
+}
+
+
+/*
+Test micro-benchmark of the memcpy optimized for esp32s3 and esp32 and print a comparison report for copy lengths from
+1 to MEMCPY_REPORT_LEN (100) bytes, where it is not clear-cut which of the two memcpys is faster
+
+Requires: esp32s3
+
+Purpose:
+    - Test how fast the esp32s3 optimized memcpy is compared to the esp32 optimized memcpy
+
+Procedure:
+    - Create 2 aligned arrays, source and destination array
+    - Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
+    - Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
+    - Calculate benchmarks and save the result
+    - Repeat the 3 above steps for different copy lengths (from 1 to MEMCPY_REPORT_LEN bytes)
+      and following arrays alignments
+        - destination array 16-byte aligned, source array 16-byte aligned
+        - destination array unaligned,       source array 16-byte aligned
+        - destination array 16-byte aligned, source array unaligned
+        - destination array unaligned,       source array unaligned
+    - Print table of results
+    - Free dynamic arrays
+*/
+TEST_CASE("dsps_memcpy_benchmark_report", "[dsps]")
+{
+    unsigned int start_count, end_count;
+    const unsigned int align_combinations = 4;      // source and destination arrays aligned or unaligned combinations
+    const int32_t arr_len = 256;
+
+    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
+    uint8_t *arr_src  = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
+    uint8_t *arr_dest_align = NULL, *arr_src_align = NULL;
+
+    // 2D arrays result_xxx[align_combinations][MEMCPY_REPORT_LEN], each one a single allocation released by a single free()
+    uint16_t (*result_aes3)[MEMCPY_REPORT_LEN] = malloc(align_combinations * sizeof(*result_aes3));
+    uint16_t (*result_ae32)[MEMCPY_REPORT_LEN] = malloc(align_combinations * sizeof(*result_ae32));
+    TEST_ASSERT_NOT_NULL(arr_dest);
+    TEST_ASSERT_NOT_NULL(arr_src);
+    TEST_ASSERT_NOT_NULL(result_aes3);
+    TEST_ASSERT_NOT_NULL(result_ae32);
+
+    for (unsigned int iter = 0; iter < align_combinations; iter++) {
+        switch (iter) {
+        case 0:                             // both 16-byte aligned
+            arr_dest_align = arr_dest;
+            arr_src_align = arr_src;
+            break;
+
+        case 1:                             // destination unaligned, source aligned
+            arr_dest_align = arr_dest + 1;
+            arr_src_align = arr_src;
+            break;
+
+        case 2:                             // source unaligned, destination aligned
+            arr_dest_align = arr_dest;
+            arr_src_align = arr_src + 1;
+            break;
+
+        case 3:                             // both unaligned
+            arr_dest_align = arr_dest + 1;
+            arr_src_align = arr_src + 1;
+            break;
+
+        default:                            // default - both aligned
+            arr_dest_align = arr_dest;
+            arr_src_align = arr_src;
+            break;
+        }
+
+        for (int cpy_amount = 1; cpy_amount <= MEMCPY_REPORT_LEN; cpy_amount++) {
+
+            start_count = dsp_get_cpu_cycle_count();
+            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+                dsps_memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
+            }
+            end_count = dsp_get_cpu_cycle_count();
+            result_aes3[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
+
+            start_count = dsp_get_cpu_cycle_count();
+            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+                memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
+            }
+            end_count = dsp_get_cpu_cycle_count();
+            result_ae32[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
+        }
+    }
+
+    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned source/destination array using default xtensa memcpy and s3 optimized memcpy");
+    printf("\n\tdest aligned \tdest unaligned\tdest   aligned\tdest unaligned\n");
+    printf(  "\tsrc  aligned \tsrc    aligned\tsrc  unaligned\tsrc  unaligned\n\n");
+    printf(  "byte \taes3    ae32\taes3    ae32\taes3    ae32\taes3    ae32\n");
+
+    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
+        printf("%d\t", i + 1);
+
+        for (unsigned int j = 0; j < align_combinations; j++) {
+            printf(" %d\t", result_aes3[j][i]);
+            printf(" %d\t", result_ae32[j][i]);
+        }
+        putchar('\n');
+    }
+
+    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
+        for (unsigned int j = 0; j < align_combinations; j++) {
+            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 4, result_aes3[j][i]);   // Unity order is (threshold, actual): passes when aes3 >= ae32 / 4 - NOTE(review): confirm this is the intended bound
+        }
+    }
+
+    free(arr_dest);
+    free(arr_src);
+    free(result_ae32);
+    free(result_aes3);
+}
+
+/*
+Test micro-benchmark of the memset optimized for esp32s3 and esp32 and print a comparison report for set lengths from
+1 to 200 bytes, where the difference between the two memsets is not unanimous
+
+Requires: esp32s3
+
+Purpose:
+    - Test how fast the esp32s3 optimized memset is compared to the esp32 optimized memset
+
+Procedure:
+    - Create 1 aligned array - destination array
+    - Set the destination array using esp32s3 memset N times, while counting CPU cycles
+    - Set the destination array using esp32 memset N times, while counting CPU cycles
+    - Calculate benchmarks and save the result
+    - Repeat the 3 above steps for different set lengths (from 1 to 200 bytes)
+      and both destination arrays alignments (16-byte aligned and unaligned)
+    - Print table of results
+    - Free dynamic arrays
+*/
+TEST_CASE("dsps_memset_benchmark_report", "[dsps]")
+{
+    unsigned int start_count, end_count;
+    const unsigned int align_combinations = 2;  // destination arrays aligned or unaligned
+    const int32_t arr_len = 256;
+    const uint8_t set_val = 0xaa;
+
+    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
+    uint8_t *arr_dest_align = NULL;
+
+    // Result tables [align_combinations][MEMSET_REPORT_LEN]; each is one contiguous allocation, so a single free() releases it
+    uint16_t (*result_aes3)[MEMSET_REPORT_LEN] = (uint16_t (*)[MEMSET_REPORT_LEN])malloc(align_combinations * sizeof(*result_aes3));
+    uint16_t (*result_ae32)[MEMSET_REPORT_LEN] = (uint16_t (*)[MEMSET_REPORT_LEN])malloc(align_combinations * sizeof(*result_ae32));
+
+    TEST_ASSERT_NOT_NULL(arr_dest);
+    TEST_ASSERT_NOT_NULL(result_aes3);
+    TEST_ASSERT_NOT_NULL(result_ae32);
+
+    for (int iter = 0; iter < align_combinations; iter++) {
+
+        if (iter == 0) {
+            arr_dest_align = arr_dest;          // destination 16-byte aligned
+        } else {
+            arr_dest_align = arr_dest + 1;      // destination unaligned
+        }
+
+        for (int set_amount = 1; set_amount <= MEMSET_REPORT_LEN; set_amount++) {
+            start_count = dsp_get_cpu_cycle_count();
+            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+                dsps_memset((void *)arr_dest_align, set_val, set_amount);
+            }
+            end_count = dsp_get_cpu_cycle_count();
+            result_aes3[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
+
+            start_count = dsp_get_cpu_cycle_count();
+            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
+                memset((void *)arr_dest_align, set_val, set_amount);
+            }
+            end_count = dsp_get_cpu_cycle_count();
+            result_ae32[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
+        }
+    }
+
+    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned destination array using default xtensa memset and s3 optimized memset");
+    printf("\n\tdest aligned \tdest unaligned\n\n");
+    printf(  "byte \taes3    ae32\taes3    ae32\n");
+
+    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
+        printf("%d\t", i + 1);
+
+        for (int j = 0; j < align_combinations; j++) {
+            printf(" %d\t", result_aes3[j][i]);
+            printf(" %d\t", result_ae32[j][i]);
+        }
+        putchar('\n');
+    }
+
+    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
+        for (int j = 0; j < align_combinations; j++) {
+            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 8, result_aes3[j][i]);
+        }
+    }
+
+    free(arr_dest);
+    free(result_ae32);
+    free(result_aes3);
+}
+
+/*
+Test micro-benchmark of the memcpy and memset functions optimized for esp32s3, with task switching
+
+Requires: esp32s3
+
+Purpose:
+    - Test how fast the esp32s3 optimized memcpy and memset are while  running memset and memcpy in multiple tasks
+
+Procedure:
+    - Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same.
+    - Run the memcpy micro-benchmark routine (from the previous test case) in each of the tasks.
+    - Start all the tasks simultaneously
+    - Wait for the tasks to complete, then delete the tasks
+    - Get the benchmark result
+    - Repeat all the above steps with memset, instead of memcpy
+    - Free the created dynamic arrays
+*/
+
+typedef struct {
+    SemaphoreHandle_t semaphore;    // given once by every benchmark task when it has finished
+    uint8_t *arr_src;               // source buffer passed to dsps_memcpy
+    uint8_t *arr_dest;              // destination buffer passed to dsps_memcpy / dsps_memset
+    uint8_t set_val;                // fill value passed to dsps_memset
+    size_t area_len;                // number of bytes copied / set per call
+    uint32_t mean_val_cpy;          // sum of the per-task mean cycle counts added by the memcpy tasks
+    uint32_t mean_val_set;          // sum of the per-task mean cycle counts added by the memset tasks
+} test_context_benchmark_t;
+
+
+static void pinned_task_benchmark_memcpy(void *arg)     // task body: times dsps_memcpy; arg is a test_context_benchmark_t *
+{
+    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);            // wait for the start notification
+    test_context_benchmark_t *context = (test_context_benchmark_t *)arg;
+    long unsigned int cycles_acc = 0;                   // cycles accumulated over all timed bursts
+    unsigned int start_memcpy_count, end_memcpy_count;
+
+    for (int j = 0; j < CPY_ITERS; j++) {
+        start_memcpy_count = dsp_get_cpu_cycle_count();
+        for (int i = 0; i < CPY_REPEAT_COUNT; i++) {
+            dsps_memcpy((void *)context->arr_dest, (void *)context->arr_src, context->area_len);
+        }
+        end_memcpy_count = dsp_get_cpu_cycle_count();
+        cycles_acc += (end_memcpy_count - start_memcpy_count);
+        vTaskDelay(1);  // Block to cause a context switch, forcing the TIE context to be saved
+    }
+    // Add this task's mean cycles per call. NOTE(review): unsynchronized += on the shared context - confirm tasks cannot race here
+    context->mean_val_cpy += (uint32_t)((cycles_acc / CPY_REPEAT_COUNT) / CPY_ITERS);
+
+    // Indicate done and wait to be deleted
+    xSemaphoreGive(context->semaphore);
+    vTaskSuspend(NULL);
+}
+
+
+static void pinned_task_benchmark_memset(void *arg)     // task body: times dsps_memset; arg is a test_context_benchmark_t *
+{
+    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);            // wait for the start notification
+    test_context_benchmark_t *context = (test_context_benchmark_t *)arg;
+    long unsigned int cycles_acc = 0;                   // cycles accumulated over all timed bursts
+    unsigned int start_memset_count, end_memset_count;
+
+    for (int j = 0; j < CPY_ITERS; j++) {
+        start_memset_count = dsp_get_cpu_cycle_count();
+        for (int i = 0; i < CPY_REPEAT_COUNT; i++) {
+            dsps_memset((void *)context->arr_dest, context->set_val, context->area_len);
+        }
+        end_memset_count = dsp_get_cpu_cycle_count();
+        cycles_acc += (end_memset_count - start_memset_count);
+        vTaskDelay(1);  // Block to cause a context switch, forcing the TIE context to be saved
+    }
+    // Add this task's mean cycles per call. NOTE(review): unsynchronized += on the shared context - confirm tasks cannot race here
+    context->mean_val_set += (uint32_t)((cycles_acc / CPY_REPEAT_COUNT) / CPY_ITERS);
+
+    // Indicate done and wait to be deleted
+    xSemaphoreGive(context->semaphore);
+    vTaskSuspend(NULL);
+}
+
+
+TEST_CASE("dsps_memset_memcpy_context_switch_benchmark", "[dsps]")
+{
+    test_context_benchmark_t test_context;
+    char task_name[10];
+
+    test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
+    test_context.area_len = (size_t)AREA_LENGTH;
+    test_context.arr_dest = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
+    test_context.arr_src = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
+    test_context.set_val = 0xab;
+    test_context.mean_val_cpy = 0;
+    test_context.mean_val_set = 0;
+
+    static void (*const pinned_functions[2])(void *) = { pinned_task_benchmark_memcpy, pinned_task_benchmark_memset };
+
+    TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);
+    TEST_ASSERT_NOT_NULL(test_context.arr_dest);    // the tasks dereference both buffers unconditionally
+    TEST_ASSERT_NOT_NULL(test_context.arr_src);
+
+    for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
+        TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];
+
+        // Create test tasks for each core
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                snprintf(task_name, sizeof(task_name), "task %d-%d", i, j);     // bounded: never writes past task_name
+                TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
+                                  &test_context, 10, &task_handles[i][j], i));
+            }
+        }
+
+        // Start the created tasks simultaneously
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                xTaskNotifyGive(task_handles[i][j]);
+            }
+        }
+
+        // Wait for the tasks to complete
+        for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
+            xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
+        }
+
+        // Delete the tasks
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                vTaskDelete(task_handles[i][j]);
+            }
+        }
+
+        vTaskDelay(10); // Short delay to allow the idle task to free task memory and TIE contexts
+    }
+
+    vSemaphoreDelete(test_context.semaphore);
+    free(test_context.arr_dest);
+    free(test_context.arr_src);
+
+    const uint32_t iterations = (uint32_t)(configNUM_CORES * TEST_PINNED_NUM_TASKS * CPY_REPEAT_COUNT * CPY_ITERS);
+    const uint32_t copy_mean_val =  (uint32_t)(test_context.mean_val_cpy / (configNUM_CORES * TEST_PINNED_NUM_TASKS));
+    const uint32_t set_mean_val = (uint32_t)(test_context.mean_val_set / (configNUM_CORES * TEST_PINNED_NUM_TASKS));
+
+    printf("\nOut of %"PRIu32" iterations, array len of %"PRIu32" bytes\n", iterations, (uint32_t)AREA_LENGTH);
+    printf("Memcpy cycles = %"PRIu32"\n", copy_mean_val);
+    printf("Memset cycles = %"PRIu32"\n", set_mean_val);
+}
+
+
+/*
+Test context switching for the TIE disabled and enabled
+
+Requires: esp32s3
+
+Purpose:
+    - Compare context switching between the tasks when TIE (esp32s3 instruction extension) is enabled and disabled to
+      see what is the switching time overhead for the TIE enabled
+
+Procedure:
+    - Create a timer, 1000 ms is used for this test, but the exact time is not crucial
+    - Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same
+    - Start the created tasks simultaneously, start the timer
+    - A task executes a single assembler instruction from the TIE, to induce the context switch
+    - As soon, as the instruction is executed, a context switch occurs
+    - A counter counts the number of context switches within the interval specified by the timer
+    - Wait for the timer to expire and terminate the tasks
+    - Get the number of task switches and delete all the tasks
+    - Repeat the 7 above steps with the created tasks executing a single generic Xtensa assembler instruction,
+      instead of the TIE instruction to get the switching overhead
+*/
+
+static volatile bool timer_expired = false;     // written by the timer callback, polled by the pinned tasks: must be re-read on every loop pass
+static TimerHandle_t one_shot_timer = NULL;
+
+typedef struct {
+    SemaphoreHandle_t semaphore;        // given once by every task after the timer has expired
+    uint32_t switch_count_tie_on;       // loop passes (one yield each) counted by the tasks running the TIE instruction
+    uint32_t switch_count_tie_off;      // loop passes (one yield each) counted by the tasks running the plain nop
+} test_context_timing_t;
+
+// Task pinned to a core, executing a TIE instruction before every yield until the timer expires
+static void pinned_task_tie_on(void *arg)
+{
+    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);    // wait for the start notification
+    test_context_timing_t *context = (test_context_timing_t *)arg;
+    vTaskDelay(1);
+
+    while (!timer_expired) {
+        asm volatile("ee.zero.q q0");
+        context->switch_count_tie_on++;     // NOTE(review): shared counter incremented without synchronization - confirm acceptable
+        taskYIELD();            // Yield to cause a context switch, forcing the TIE context to be saved
+    }
+    xSemaphoreGive(context->semaphore);     // indicate done and wait to be deleted
+    vTaskSuspend(NULL);
+}
+
+// Task pinned to a core, executing a generic Xtensa instruction before every yield until the timer expires
+static void pinned_task_tie_off(void *arg)
+{
+    ulTaskNotifyTake(pdTRUE, portMAX_DELAY);    // wait for the start notification
+    test_context_timing_t *context = (test_context_timing_t *)arg;
+    vTaskDelay(1);
+
+    while (!timer_expired) {
+        asm volatile("nop");
+        context->switch_count_tie_off++;    // NOTE(review): shared counter incremented without synchronization - confirm acceptable
+        taskYIELD();            // Yield to cause a context switch, forcing the context to be saved
+    }
+
+    xSemaphoreGive(context->semaphore);     // indicate done and wait to be deleted
+    vTaskSuspend(NULL);
+}
+
+static void context_switch_timer_callback(TimerHandle_t xTimer)     // timer callback; xTimer is not used
+{
+    timer_expired = true;       // ends the polling loops of the pinned tasks
+}
+
+
+TEST_CASE("dsps_TIE_context_switch_timing", "[dsps]")
+{
+    test_context_timing_t test_context;
+    const TickType_t timer_period_ms = 1000;
+    char task_name[10];
+
+    test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
+    test_context.switch_count_tie_off = 0;
+    test_context.switch_count_tie_on = 0;
+    TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);
+
+    static void (*const pinned_functions[2])(void *) = { pinned_task_tie_on, pinned_task_tie_off };
+
+    one_shot_timer = xTimerCreate("timer", pdMS_TO_TICKS(timer_period_ms), pdFALSE, (void *)0, context_switch_timer_callback);
+    TEST_ASSERT_NOT_NULL(one_shot_timer);   // without the timer the pinned tasks would never stop
+
+    for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
+        timer_expired = false;
+        TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];
+
+        // Create test tasks for each core
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                snprintf(task_name, sizeof(task_name), "task %d-%d", i, j);     // bounded: never writes past task_name
+                TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
+                                  &test_context, 1, &task_handles[i][j], i));
+            }
+        }
+
+        // Start the created tasks simultaneously
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                xTaskNotifyGive(task_handles[i][j]);
+            }
+        }
+        xTimerStart(one_shot_timer, portMAX_DELAY);
+        vTaskDelay(1);
+
+        // Wait for the tasks to complete
+        for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
+            xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
+        }
+
+        // Delete the tasks
+        for (int i = 0; i < configNUM_CORES; i++) {
+            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
+                vTaskDelete(task_handles[i][j]);
+            }
+        }
+        vTaskDelay(10); // Short delay to allow the idle task to free task memory and TIE contexts
+    }
+
+    vSemaphoreDelete(test_context.semaphore);
+    xTimerDelete(one_shot_timer, portMAX_DELAY);    // release the timer created above; it was previously leaked
+
+    printf("\nContext switching count within %"PRIu32" ms interval\n", (uint32_t)timer_period_ms);
+    printf("TIE enabled  %"PRIu32"\n", test_context.switch_count_tie_on);
+    printf("TIE disabled %"PRIu32"\n", test_context.switch_count_tie_off);
+
+    float overhead = (((float)test_context.switch_count_tie_off / (float)test_context.switch_count_tie_on) * 100) - 100;
+    printf("Switch overhead %.2f %%\n", overhead);
+}
--- a/managed_components/espressif__esp-dsp/modules/support/misc/dsps_d_gen.c
+++ b/managed_components/espressif__esp-dsp/modules/support/misc/dsps_d_gen.c
@@ -0,0 +1,30 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_d_gen.h"
+
+// Generate a unit impulse: output[pos] = 1 and every other one of the len samples = 0.
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when pos is outside [0, len), ESP_OK otherwise.
+esp_err_t dsps_d_gen_f32(float *output, int len, int pos)
+{
+    if ((pos < 0) || (pos >= len)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    // Single pass: pick the value of each sample while writing it.
+    for (int n = 0; n < len; n++) {
+        output[n] = (n == pos) ? 1.0f : 0.0f;
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/misc/dsps_h_gen.c
+++ b/managed_components/espressif__esp-dsp/modules/support/misc/dsps_h_gen.c
@@ -0,0 +1,32 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_h_gen.h"
+
+// Generate a unit step: samples before pos are 0, samples from pos up to len - 1 are 1.
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when pos is outside [0, len), ESP_OK otherwise.
+esp_err_t dsps_h_gen_f32(float *output, int len, int pos)
+{
+    if ((pos < 0) || (pos >= len)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    int n = 0;
+    while (n < pos) {
+        output[n++] = 0;    // before the step
+    }
+    while (n < len) {
+        output[n++] = 1;    // at and after the step
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/misc/dsps_tone_gen.c
+++ b/managed_components/espressif__esp-dsp/modules/support/misc/dsps_tone_gen.c
@@ -0,0 +1,39 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_tone_gen.h"
+#include <math.h>
+
+// Fill output with len samples of Ampl * sin(ph); ph starts at phase (degrees) and advances by 2*pi*freq per sample.
+// freq must lie strictly inside (-1, 1), otherwise ESP_ERR_DSP_INVALID_PARAM is returned.
+esp_err_t dsps_tone_gen_f32(float *output, int len, float Ampl, float freq, float phase)
+{
+    if ((freq >= 1) || (freq <= -1)) {
+        return ESP_ERR_DSP_INVALID_PARAM;
+    }
+    const double two_pi = 2 * M_PI;
+    float ph = phase / 180 * M_PI;      // start phase, degrees -> radians
+    float fr = two_pi * freq;           // phase advance per sample
+    for (int n = 0; n < len; n++) {
+        output[n] = Ampl * sin(ph);
+        ph += fr;
+        // Pull the running phase back by one turn once it leaves [-2*pi, 2*pi].
+        if (ph > two_pi) {
+            ph -= two_pi;
+        } else if (ph < -two_pi) {
+            ph += two_pi;
+        }
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/sfdr/float/dsps_sfdr_f32.cpp
+++ b/managed_components/espressif__esp-dsp/modules/support/sfdr/float/dsps_sfdr_f32.cpp
@@ -0,0 +1,74 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_sfdr.h"
+#include "dsps_fft2r.h"
+#include "dsp_common.h"
+#include <math.h>
+#include <limits>
+#include "esp_log.h"
+
+static const char *TAG = "sfdr";
+
+// Estimate the spurious-free dynamic range of a real signal, in dB.
+// The input is Hann-windowed and transformed with a radix-2 FFT; the result is the distance between the
+// strongest bin and the strongest bin lying more than wind_width bins away from it (use_dc == 0 also
+// skips the first wind_width bins). Returns 0 when len is not a power of two.
+float dsps_sfdr_f32(const float *input, int32_t len, int8_t use_dc)
+{
+    if (!dsp_is_power_of_two(len)) {
+        return 0;
+    }
+
+    float *temp_array = new float[len * 2];
+    for (int i = 0 ; i < len ; i++) {
+        float wind = 0.5 * (1 - cosf(i * 2 * M_PI / (float)len));
+        temp_array[i * 2 + 0] = input[i] * wind;
+        temp_array[i * 2 + 1] = 0;
+    }
+    dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);
+
+    dsps_fft2r_fc32_ansi(temp_array, len);
+    dsps_bit_rev_fc32_ansi(temp_array, len);
+
+    float max = -std::numeric_limits<float>::max();     // most negative float: dB values may all be below zero, and min() is the smallest POSITIVE value
+    int max_pos = 0;
+    for (int i = 0 ; i < len / 2 ; i++) {
+        temp_array[i] = 10 * log10f(temp_array[i * 2 + 0] * temp_array[i * 2 + 0] + temp_array[i * 2 + 1] * temp_array[i * 2 + 1]);
+        if (temp_array[i] > max) {
+            max = temp_array[i];
+            max_pos = i;
+        }
+        ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB", i, temp_array[i]);
+    }
+    int start_pos = 0;
+    int wind_width = 5;
+    float min_diff = std::numeric_limits<float>::max();
+
+    if (use_dc == 0) {
+        start_pos = wind_width;
+    }
+    for (int i = start_pos ; i < len / 2 ; i++) {
+        if ((i < (max_pos - wind_width)) || (i > (max_pos + wind_width))) {
+            float diff = max - temp_array[i];
+            if (diff < min_diff) {
+                ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB, maX=%f, max_pos=%i", i, temp_array[i], max, max_pos);
+                min_diff = diff;
+            }
+        }
+    }
+
+    delete[] temp_array;
+    return min_diff;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/sfdr/test/test_dsps_sfdr_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/support/sfdr/test/test_dsps_sfdr_f32.c
@@ -0,0 +1,43 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_view.h"
+#include "dsps_sfdr.h"
+#include "dsps_fft2r.h"
+
+
+static const char *TAG = "dsps_sfdr_f32";
+
+TEST_CASE("dsps_sfdr_f32 functionality", "[dsps]")
+{
+    int N = 512;
+    float *data = (float *)malloc(N * 2 * sizeof(float));
+    int check_bin = 32;
+    float sfdr_exp = 4;     // amplitude ratio between the main tone and the second tone generated below
+    for (int i = 0 ; i < N ; i++) {
+        data[i] = 4 * sinf(M_PI / N * check_bin * i) / (N / 2);            // main tone
+        data[i] += sinf(M_PI / N * check_bin * i * 2) / (N / 2);           // second tone at twice the frequency, 1/4 of the amplitude
+    }
+
+    float sfdr = dsps_sfdr_f32(data, N, 1);
+    TEST_ASSERT_EQUAL( (int)20 * log10(sfdr_exp), (int)sfdr);   // NOTE(review): the (int) cast binds to 20 only, not to the whole product - confirm intent
+    ESP_LOGI(TAG, "dsps_sfdr_f32 = %f dB", sfdr);
+    dsps_fft2r_deinit_fc32();
+    free(data);
+}
--- a/managed_components/espressif__esp-dsp/modules/support/snr/float/dsps_snr_f32.cpp
+++ b/managed_components/espressif__esp-dsp/modules/support/snr/float/dsps_snr_f32.cpp
@@ -0,0 +1,78 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_snr.h"
+#include "dsps_fft2r.h"
+#include "dsp_common.h"
+#include <math.h>
+#include <limits>
+#include "esp_log.h"
+
+static const char *TAG = "snr";
+
+// Estimate the signal-to-noise ratio of a real signal, in dB.
+// The input is Hann-windowed and transformed with a radix-2 FFT; the strongest bin is taken as the signal and
+// the bins more than wind_width away from it are summed as noise (use_dc == 0 also skips the first
+// wind_width bins). Returns 0 when len is not a power of two and 192 when the noise is negligible.
+float dsps_snr_f32(const float *input, int32_t len, uint8_t use_dc)
+{
+    if (!dsp_is_power_of_two(len)) {
+        return 0;
+    }
+
+    float *temp_array = new float[len * 2];
+    for (int i = 0 ; i < len ; i++) {
+        float wind = 0.5 * (1 - cosf(i * 2 * M_PI / (float)len));
+        temp_array[i * 2 + 0] = input[i] * wind;
+        temp_array[i * 2 + 1] = 0;
+    }
+    dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);
+
+    dsps_fft2r_fc32_ansi(temp_array, len);
+    dsps_bit_rev_fc32_ansi(temp_array, len);
+
+    float max = std::numeric_limits<float>::min();      // bin powers are never negative, so the smallest positive float is a valid floor
+    int max_pos = 0;
+    for (int i = 0 ; i < len / 2 ; i++) {
+        temp_array[i] = temp_array[i * 2 + 0] * temp_array[i * 2 + 0] + temp_array[i * 2 + 1] * temp_array[i * 2 + 1];
+        if (temp_array[i] > max) {
+            max = temp_array[i];
+            max_pos = i;
+        }
+        ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB", i, temp_array[i]);
+    }
+    int start_pos = 0;
+    int wind_width = 7;
+
+    if (use_dc == 0) {
+        start_pos = wind_width;
+    }
+    float noise_power = 0;
+    for (int i = start_pos ; i < len / 2 ; i++) {
+        if ((i < (max_pos - wind_width)) || (i > (max_pos + wind_width))) {
+            noise_power += temp_array[i];
+            ESP_LOGD(TAG, "FFT Data[%i] =%8.4f dB, maX=%f, max_pos=%i, noise_power=%f", i, temp_array[i], max, max_pos, noise_power);
+        }
+    }
+
+    delete[] temp_array;
+    noise_power += std::numeric_limits<float>::min();   // keeps the division below away from 0 / 0
+    if (noise_power < max * 0.00000000001) {
+        return 192;
+    }
+    float snr = max / noise_power;
+    float result = 10 * log10(snr) - 2; // 2 - window correction
+    ESP_LOGI(TAG, "SNR = %f, result=%f dB", snr, result);
+    return result;
+}
--- a/managed_components/espressif__esp-dsp/modules/support/snr/test/test_dsps_snr_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/support/snr/test/test_dsps_snr_f32.c
@@ -0,0 +1,44 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_view.h"
+#include "dsps_snr.h"
+#include "dsps_fft2r.h"
+
+
+static const char *TAG = "dsps_snr_f32";
+
+
+TEST_CASE("dsps_snr_f32 functionality", "[dsps]")
+{
+    int N = 512;
+    float *data = (float *)malloc(N * 2 * sizeof(float));
+    int check_bin = 32;
+    float snr_exp = 0.001;  // reference ratio the expected SNR below is derived from
+    for (int i = 0 ; i < N ; i++) {
+        data[i] = 1 * sinf(M_PI / N * check_bin * i) / (N / 2);    // test tone
+        data[i] += 0.001 / N; //0.1*sinf(M_PI/N*check_bin*i*2)/(N/2);
+    }
+
+    float snr = dsps_snr_f32(data, N, 1);
+    TEST_ASSERT_EQUAL(-round(20 * log10(snr_exp) + 3), (int)round(snr));   // expected value: 20*log10(snr_exp) + 3, negated and rounded
+    ESP_LOGI(TAG, "dsps_snr_f32 = %f dB", snr);
+    dsps_fft2r_deinit_fc32();
+    free(data);
+}
--- a/managed_components/espressif__esp-dsp/modules/support/view/dsps_view.cpp
+++ b/managed_components/espressif__esp-dsp/modules/support/view/dsps_view.cpp
@@ -0,0 +1,120 @@
+#include "dsps_view.h"
+#include <math.h>
+#include "esp_log.h"
+#include <limits>
+#include <inttypes.h>
+
+
+// Print an ASCII plot (width x height characters) of data[0..len), with the vertical axis spanning [min, max].
+void dsps_view(const float *data, int32_t len, int width, int height, float min, float max, char view_char)
+{
+    uint8_t *view_data = new uint8_t[width * height];
+    float *view_data_min = new float[width];
+    float *view_data_max = new float[width];
+
+    for (int y = 0; y < height ; y++) {
+        for (int x = 0 ; x < width ; x++) {
+            view_data[y * width + x] = ' ';
+        }
+    }
+    for (int i = 0 ; i < width ; i++) {
+        view_data_min[i] = max;
+        view_data_max[i] = min;
+    }
+    float x_step = (float)(width) / (float)len;
+    float y_step = (float)(height - 1) / (max - min);
+    float data_min = std::numeric_limits<float>::max();
+    float data_max = -std::numeric_limits<float>::max();    // most negative float; min() is the smallest POSITIVE value and hides non-positive maxima
+    int min_pos = 0;
+    int max_pos = 0;
+
+    for (int i = 0 ; i < len ; i++) {
+        int x_pos = i * x_step;     // column this sample falls into
+        if (data[i] < view_data_min[x_pos]) {
+            view_data_min[x_pos] = data[i];
+        }
+        if (data[i] > view_data_max[x_pos]) {
+            view_data_max[x_pos] = data[i];
+        }
+
+        if (view_data_min[x_pos] < min) {
+            view_data_min[x_pos] = min;
+        }
+        if (view_data_max[x_pos] > max) {
+            view_data_max[x_pos] = max;
+        }
+        ESP_LOGD("view", "for i=%i, x_pos=%i,  max=%f, min=%f, data=%f", i, x_pos, view_data_max[x_pos], view_data_min[x_pos], data[i]);
+        if (data[i] > data_max) {
+            data_max = data[i];
+            max_pos = i;
+        }
+        if (data[i] < data_min) {
+            data_min = data[i];
+            min_pos = i;
+        }
+    }
+    ESP_LOGI("view", "Data min[%i] = %f, Data max[%i] = %f", min_pos, data_min, max_pos, data_max);
+    ESP_LOGD("view", "y_step = %f", y_step);
+    for (int x = 0 ; x < width ; x++) {
+        int y_count = (view_data_max[x] - view_data_min[x]) * y_step + 1;   // rows covered by this column's [min, max] span
+        ESP_LOGD("view", "For x= %i y_count=%i  ,min =%f, max=%f, ... ", x, y_count, view_data_min[x], view_data_max[x]);
+        for (int y = 0 ; y < y_count ; y++) {
+            int y_pos = (max - view_data_max[x]) * y_step + y;     // row 0 corresponds to max
+            ESP_LOGD("view", " %i, ", y_pos);
+            view_data[y_pos * width + x] = view_char;
+        }
+        ESP_LOGD("view", " ");
+    }
+
+    // Simple output
+    // for (int i=0 ; i< len ; i++)
+    // {
+    //     float x_step = (float)(width-1)/(float)len;
+    //     float y_step = (float)(height-1)/(max - min);
+    //     int x_pos = i*x_step;
+    //     int y_pos = data[i]*y_step;
+    //     if (data[i] >= max) y_pos = 0;
+    //     if (data[i] <= min) y_pos = height-1;
+    //     view_data[y_pos*width + x_pos] = view_char;
+    //     printf("For data[%i]=%f, x_pos%i, y_pos=%i\n", i, data[i], x_pos, y_pos);
+    // }
+    // printf("\n");
+    printf(" ");
+    for (int x = 0 ; x < width ; x++) {
+        printf("_");
+    }
+    printf("\n");
+    for (int y = 0; y < height ; y++) {
+        printf("%i", y % 10);
+        for (int x = 0 ; x < width ; x++) {
+            printf("%c", view_data[y * width + x]);
+        }
+        printf("|\n");
+    }
+    printf(" ");
+    for (int x = 0 ; x < width ; x++) {
+        printf("%i", x % 10);
+    }
+    printf("\n");
+    ESP_LOGI("view", "Plot: Length=%i, min=%f, max=%f", (int)len, min, max);
+    delete[] view_data;
+    delete[] view_data_min;
+    delete[] view_data_max;
+}
+
+// Plot 16-bit samples: every value is converted to float, divided by 32768 and handed to dsps_view().
+void dsps_view_s16(const int16_t *data, int32_t len, int width, int height, float min, float max, char view_char)
+{
+    float *scaled = new float[len];
+    for (int32_t n = 0; n < len; n++) {
+        scaled[n] = (float)data[n] / 32768;
+    }
+
+    dsps_view(scaled, len, width, height, min, max, view_char);
+    delete[] scaled;
+}
+
+void dsps_view_spectrum(const float *data, int32_t len, float min, float max)   // dsps_view() preset: fixed 64 x 10 plot drawn with '|'
+{
+    dsps_view(data, len, 64, 10, min, max, '|');
+}
--- a/managed_components/espressif__esp-dsp/modules/support/view/test/test_dsps_view.c
+++ b/managed_components/espressif__esp-dsp/modules/support/view/test/test_dsps_view.c
@@ -0,0 +1,36 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dsps_view.h"
+
+
+static const char *TAG = "dsps_view";
+
+TEST_CASE("dsps_view functionality", "[dsps]")
+{
+    float *data = (float *)malloc(1024 * sizeof(float));
+    for (int i = 0 ; i < 1024 ; i++) {
+        data[i] = -100;     // flat floor at the bottom of the plotted range
+    }
+    data[256] = 0;          // single peak at the top of the plotted range
+    dsps_view_spectrum(data, 1024, -100, 0);
+
+    ESP_LOGI(TAG, "Just a check\n");   // the plot is only printed; this test asserts nothing
+    free(data);
+}