add some code
This commit is contained in:
@@ -0,0 +1,340 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsps_mem_platform.h"
|
||||
#if dsps_mem_aes3_enbled
|
||||
|
||||
// This is memory access for ESP32S3 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_memcpy_aes3
|
||||
.type dsps_memcpy_aes3,@function
|
||||
// The function implements the following C code:
|
||||
// void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
|
||||
|
||||
// Input params Variables
|
||||
//
|
||||
// arr_dest - a2 loop_len - a5, a6
|
||||
// arr_src - a3 p_arr_des - a7
|
||||
// arr_len - a4 div_48 - a8
|
||||
// align_mask - a9
|
||||
|
||||
/*
|
||||
esp32s3 optimized memcpy function works with both, aligned and unaligned data.
|
||||
|
||||
arr_dest aligned --> - _main_loop_aligned, 32 bytes in one run through the cycle, only aligned data
|
||||
arr_src aligned / - Check modulos to finish copying the remaining data outside of the cycle
|
||||
- Modulo 8 and 16 - S3 instructions for aligned data, the rest of the modulos are generic
|
||||
|
||||
arr_dest aligned ---> - _main_loop_unaligned, 48 bytes of source unaligned data in one run through the cycle,
|
||||
arr_src unaligned / (the destination must always be aligned)
|
||||
- Check modulos to finish copying remaining data outside of the cycle
|
||||
- Modulo 32 and 16 - S3 instructions for unaligned data, the rest of the modulos are generic
|
||||
|
||||
arr_dest unaligned -> - First, use generic instructions to align the arr_dest data (keep increasing
|
||||
arr_src aligned / the arr_dest pointer until the pointer is aligned)
|
||||
- Once arr_dest is aligned treat the rest of the data as:
|
||||
either both aligned (if arr_src happens to be aligned after the arr_dest aligning),
|
||||
or as arr_dest aligned and arr_src unaligned
|
||||
- Continue as mentioned above
|
||||
|
||||
arr_dest unaligned -> - Very same approach as with arr_dest unaligned and arr_src aligned
|
||||
arr_src unaligned /
|
||||
|
||||
if the arr_len is less than 16, jump to _less_than_16 label and copy data without any s3 instructions or cycles
|
||||
*/
|
||||
#define MEMCPY_OPTIMIZED 1 // Use optimized memcpy or ANSI memcpy
|
||||
#define TIE_ENABLE 0 // Put a dummy TIE instruction to the ANSI memcpy to induce TIE context saving
|
||||
|
||||
dsps_memcpy_aes3:
|
||||
|
||||
#if MEMCPY_OPTIMIZED
|
||||
|
||||
// S3 optimized version of the memcpy (with TIE instrucstions)
|
||||
|
||||
entry a1, 32
|
||||
mov a7, a2 // a7 - save arr_dest pointer
|
||||
|
||||
blti a4, 16, _less_than_16
|
||||
|
||||
// arr_dest alignment check
|
||||
movi.n a9, 0xf // 0xf alignment mask
|
||||
and a13, a9, a2 // 0xf AND arr_dest pointer
|
||||
beqz a13, _arr_dest_aligned
|
||||
|
||||
movi.n a14, 16 // a14 - 16
|
||||
sub a13, a14, a13 // a13 = 16 - unalignment
|
||||
sub a4, a4, a13 // len = len - (16 - unalignment)
|
||||
|
||||
// Aligning the arr_dest
|
||||
// keep copying until arr_dest is aligned
|
||||
|
||||
// Check modulo 8 of the unalignment, if - then copy 8 bytes
|
||||
bbci a13, 3, _aligning_mod_8_check // branch if 3-rd bit of unalignment a13 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
|
||||
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
|
||||
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
|
||||
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
|
||||
_aligning_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the unalignment, if - then copy 4 bytes
|
||||
bbci a13, 2, _aligning_mod_4_check // branch if 2-nd bit of unalignment a13 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_aligning_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the unalignment, if - then copy 2 bytes
|
||||
bbci a13, 1, _aligning_mod_2_check // branch if 1-st bit of unalignment a13 is clear
|
||||
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
|
||||
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_aligning_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the unalignment, if - then copy 1 byte
|
||||
bbci a13, 0, _arr_dest_aligned // branch if 0-th bit of unalignment a13 is clear
|
||||
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 1 // increment arr_src pointer by 1 byte
|
||||
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 1 // increment arr_dest pointer by 1 byte
|
||||
|
||||
_arr_dest_aligned:
|
||||
|
||||
// arr_src alignment check
|
||||
and a15, a9, a3 // 0xf (alignment mask) AND arr_src pointer
|
||||
beqz a15, _arr_src_aligned
|
||||
|
||||
// arr_src unaligned, arr_dest aligned (arr_des either aligned originally or modified to be aligned by the Aligning the arr_des routine)
|
||||
|
||||
// Calculate modulo for non-aligned data
|
||||
movi a8, 89478486 // a8 - div_48 constant
|
||||
muluh a5, a8, a4 // a5 - loop_len = arr_len / 48
|
||||
movi a9, 48 // a9 - 48
|
||||
mul16s a8, a9, a5 // a8 - 48 * loop_len
|
||||
sub a6, a4, a8 // a6 - loop_len_MOD 48
|
||||
|
||||
ee.ld.128.usar.ip q2, a3, 16 // Preload from arr_src
|
||||
ee.ld.128.usar.ip q3, a3, 16 // Preload from arr_src
|
||||
|
||||
// Main loop arr_src unaligned
|
||||
loopnez a5, ._main_loop_unaligned // 48 bytes in one loop
|
||||
ee.src.q.ld.ip q4, a3, 16, q2, q3 // preload and shift from arr_src
|
||||
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
|
||||
ee.src.q.ld.ip q2, a3, 16, q3, q4 // preload and shift from arr_src
|
||||
ee.vst.128.ip q3, a2, 16 // store to aligned arr_dest
|
||||
ee.src.q.ld.ip q3, a3, 16, q4, q2 // preload and shift from arr_src
|
||||
ee.vst.128.ip q4, a2, 16 // store to aligned arr_dest
|
||||
._main_loop_unaligned:
|
||||
|
||||
// Finish the _main_loop_unaligned outside of the loop from Q registers preloads
|
||||
// Check modulo 32 of the loop_len_MOD, if - then copy 32 bytes
|
||||
bbci a6, 5, _unaligned_mod_32_check // branch if 5-th bit of loop_len_MOD a6 is clear
|
||||
ee.src.q.ld.ip q4, a3, 0, q2, q3 // preload and shift from arr_src
|
||||
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
|
||||
ee.src.q q3, q3, q4 // final shift
|
||||
ee.vst.128.ip q3, a2, 16 // store to aligned arr_dest
|
||||
j _follow_unaligned
|
||||
_unaligned_mod_32_check:
|
||||
|
||||
// Check modulo 16 of the loop_len_MOD, if - then copy 16 bytes
|
||||
bbci a6, 4, _unaligned_mod_16_check // branch if 4-th bit of loop_len_MOD a6 is clear
|
||||
ee.src.q q2, q2, q3 // final shift
|
||||
ee.vst.128.ip q2, a2, 16 // store to aligned arr_dest
|
||||
addi a3, a3, -16 // put arr_src pointer back
|
||||
j _follow_unaligned
|
||||
_unaligned_mod_16_check:
|
||||
|
||||
addi a3, a3, -32 // put arr_src pointer back
|
||||
|
||||
// Finish the _main_loop_unaligned outside of the loop
|
||||
// Check modulo 8 of the loop_len_MOD, if - then copy 8 bytes
|
||||
_follow_unaligned:
|
||||
bbci a6, 3, _unaligned_mod_8_check // branch if 3-rd bit of loop_len_MOD a6 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
|
||||
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
|
||||
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
|
||||
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
|
||||
_unaligned_mod_8_check:
|
||||
|
||||
// Finish the rest of the data, as if the data were aligned, no S3 instructions will be used further after the jump
|
||||
j _aligned_mod_8_check
|
||||
|
||||
// Both arrays (arr_src and arr_dest) aligned
|
||||
_arr_src_aligned:
|
||||
|
||||
// Calculate modulo 32 for aligned data
|
||||
srli a5, a4, 5 // a5 - loop_len = arr_len / 32
|
||||
slli a6, a5, 5
|
||||
sub a6, a4, a6 // a6 - loop_len_MOD 32
|
||||
|
||||
// Main loop arr_src aligned
|
||||
loopnez a5, ._main_loop_aligned // 32 bytes in one loop
|
||||
ee.vld.128.ip q0, a3, 16 // load 16 bytes from arr_src to q0
|
||||
ee.vld.128.ip q1, a3, 16 // load 16 bytes from arr_src to q1
|
||||
|
||||
ee.vst.128.ip q0, a2, 16 // save 16 bytes to arr_dest from q0
|
||||
ee.vst.128.ip q1, a2, 16 // save 16 bytes to arr_dest from q1
|
||||
._main_loop_aligned:
|
||||
|
||||
// Modulo 32 check
|
||||
beqz a6, _aligned_mod_32_check // branch if mod_32 = 0
|
||||
|
||||
// finish the end of the array outside of the main loop
|
||||
// Check modulo 16 of the loop_len_MOD, if - then copy 16 bytes
|
||||
bbci a6, 4, _aligned_mod_16_check // branch if 4-th bit of loop_len_MOD a6 is clear
|
||||
ee.vld.128.ip q0, a3, 16 // load 128 bits from arr_src to q0, increase arr_src pointer by 16 bytes
|
||||
ee.vst.128.ip q0, a2, 16 // save 128 bits to arr_dest from q0, increase arr_dest pointer by 16 bytes
|
||||
_aligned_mod_16_check:
|
||||
|
||||
// Check modulo 8 of the loop_len_MOD, if - then copy 8 bytes
|
||||
bbci a6, 3, _aligned_mod_8_check // branch if 3-rd bit of loop_len_MOD a6 is clear
|
||||
ee.vld.l.64.ip q0, a3, 8 // load lower 64 bits from arr_src a3 to q0, increase arr_src pointer by 8 bytes
|
||||
ee.vst.l.64.ip q0, a2, 8 // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
|
||||
_aligned_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the loop_len_MOD, if - then copy 4 bytes
|
||||
bbci a6, 2, _aligned_mod_4_check // branch if 2-nd bit of loop_len_MOD a6 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_aligned_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the loop_len_MOD, if - then copy 2 bytes
|
||||
bbci a6, 1, _aligned_mod_2_check // branch if 1-st bit of loop_len_MOD a6 is clear
|
||||
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
|
||||
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_aligned_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the loop_len_MOD, if - then copy 1 byte
|
||||
bbci a6, 0, _aligned_mod_32_check // branch if 0-th bit of loop_len_MOD a6 is clear
|
||||
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
|
||||
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
|
||||
|
||||
_aligned_mod_32_check:
|
||||
|
||||
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
_less_than_16:
|
||||
|
||||
// If the length of the copied array is lower than 16, it is faster not to use esp32s3-optimized functions
|
||||
|
||||
// Check modulo 8 of the arr_len, if - then copy 8 bytes
|
||||
bbci a4, 3, _less_than_16_mod_8_check // branch if 3-rd bit of arr_len a4 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15, offset 0
|
||||
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14, offset 4
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2, offset 0
|
||||
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2, offset 4
|
||||
addi.n a3, a3, 8 // increment arr_src pointer by 8 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
|
||||
_less_than_16_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the arr_len, if - then copy 4 bytes
|
||||
bbci a4, 2, _less_than_16_mod_4_check // branch if 2-nd bit of arr_len a4 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_less_than_16_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the arr_len, if - then copy 2 bytes
|
||||
bbci a4, 1, _less_than_16_mod_2_check // branch if 1-st bit of arr_len a4 is clear
|
||||
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
|
||||
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_less_than_16_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the arr_len, if - then copy 1 byte
|
||||
bbci a4, 0, _less_than_16_mod_1_check // branch if 0-th bit of arr_len a4 is clear
|
||||
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
|
||||
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
|
||||
_less_than_16_mod_1_check:
|
||||
|
||||
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
|
||||
#else // MEMCPY_OPTIMIZED
|
||||
|
||||
// ansi version of the memcpy (without TIE instructions) for testing purposes
|
||||
|
||||
entry a1, 32
|
||||
mov a7, a2 // a7 - save arr_dest pointer
|
||||
|
||||
srli a5, a4, 4 // a5 - loop_len = arr_len / 16
|
||||
|
||||
// Run main loop which copies 16 bytes in one loop run
|
||||
loopnez a5, ._ansi_loop
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a14
|
||||
l32i.n a13, a3, 8 // load 32 bits from arr_src a3 to a13
|
||||
l32i.n a12, a3, 12 // load 32 bits from arr_src a3 to a13
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
s32i.n a14, a2, 4 // save 32 bits from a14 to arr_dest a2
|
||||
s32i.n a13, a2, 8 // save 32 bits from a13 to arr_dest a2
|
||||
s32i.n a12, a2, 12 // save 32 bits from a13 to arr_dest a2
|
||||
addi.n a3, a3, 16 // increment arr_src pointer by 12 bytes
|
||||
addi.n a2, a2, 16 // increment arr_dest pointer by 12 bytes
|
||||
._ansi_loop:
|
||||
|
||||
// Finish the remaining bytes out of the loop
|
||||
// Check modulo 8 of the arr_len, if - then copy 8 bytes
|
||||
bbci a4, 3, _mod_8_check // branch if 2-nd bit of arr_len a4 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
l32i.n a14, a3, 4 // load 32 bits from arr_src a3 to a15
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
s32i.n a14, a2, 4 // save 32 bits from a15 to arr_dest a2
|
||||
addi.n a3, a3, 8 // increment arr_src pointer by 4 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 4 bytes
|
||||
_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the arr_len, if - then copy 4 bytes
|
||||
bbci a4, 2, _mod_4_check // branch if 2-nd bit of arr_len a4 is clear
|
||||
l32i.n a15, a3, 0 // load 32 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 4 // increment arr_src pointer by 4 bytes
|
||||
s32i.n a15, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the arr_len, if - then copy 2 bytes
|
||||
bbci a4, 1, _mod_2_check // branch if 1-st bit of arr_len a4 is clear
|
||||
l16ui a15, a3, 0 // load 16 bits from arr_src a3 to a15
|
||||
addi.n a3, a3, 2 // increment arr_src pointer by 2 bytes
|
||||
s16i a15, a2, 0 // save 16 bits from a15 to arr_dest a2
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the arr_len, if - then copy 1 byte
|
||||
bbci a4, 0, _mod_1_check // branch if 0-th bit of arr_len a4 is clear
|
||||
l8ui a15, a3, 0 // load 8 bits from arr_src a3 to a15
|
||||
s8i a15, a2, 0 // save 8 bits from a15 to arr_dest a2
|
||||
_mod_1_check:
|
||||
|
||||
// if arr_len is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
|
||||
blti a4, 16, _less_than_16_1 // branch, if arr_len a4 is shorter than 16 bytes
|
||||
#if TIE_ENABLE // put dummy TIE instruction to induce TIE context saving
|
||||
ee.zero.qacc // initialize q0 to zero (dummy instruction)
|
||||
#else // TIE_ENABLE
|
||||
nop // compensate one cycle, when TIE is disabled to get the same benchmark value
|
||||
#endif // TIE_ENABLE
|
||||
_less_than_16_1:
|
||||
|
||||
mov a2, a7 // copy the initial arr_dest pointer from a7 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
#endif // MEMCPY_OPTIMIZED
|
||||
|
||||
#endif // dsps_mem_aes3_enbled
|
||||
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsps_mem_platform.h"
|
||||
#if dsps_mem_aes3_enbled
|
||||
|
||||
// This is memory access for ESP32S3 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_memset_aes3
|
||||
.type dsps_memset_aes3,@function
|
||||
// The function implements the following C code:
|
||||
// void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
|
||||
|
||||
// Input params Variables
|
||||
//
|
||||
// arr_dest - a2 loop_len - a5
|
||||
// set_val - a3 p_arr_dest - a8
|
||||
// set_size - a4 8_bit_set - a7
|
||||
// 16_bit_set - a9
|
||||
// 32_bit_set - a10
|
||||
// align_mask - a11
|
||||
|
||||
/*
|
||||
esp32s3 optimized memset function works with both, aligned and unaligned data.
|
||||
|
||||
arr_dest aligned - _main_loop, 16 bytes in one loop, only aligned data
|
||||
- Check modulos to finish copying remaining data outside of the cycle
|
||||
- Modulo 8 - S3 instruction for aligned data, the rest of the modulos are generic
|
||||
|
||||
arr_dest unaligned - First, use generic instructions to align the arr_dest data (keep increasing
|
||||
the arr_dest pointer until the pointer is aligned)
|
||||
- Once arr_dest is aligned treat the rest of the data as aligned, same as above
|
||||
|
||||
if the set_size is less than 16, jump to _less_than_16 label and set data without any s3 instructions or cycles
|
||||
*/
|
||||
|
||||
#define MEMSET_OPTIMIZED 1 // Use optimized memset or ansi memset
|
||||
#define TIE_ENABLE 0 // Put a dummy TIE instruction to ANSI memset to induce TIE context saving
|
||||
|
||||
dsps_memset_aes3:
|
||||
|
||||
#if MEMSET_OPTIMIZED
|
||||
|
||||
entry a1, 32
|
||||
mov a8, a2 // a8 - save arr_dest pointer
|
||||
blti a4, 16, _less_than_16 // set_size shorter than 16
|
||||
|
||||
movi.n a7, 0xff // 0xff one-byte mask
|
||||
movi.n a11, 0xf // 0xf alignment mask
|
||||
and a7, a7, a3 // mask upper 24 bits of set_val a3
|
||||
|
||||
bnez.n a7, _non_zero_constant
|
||||
ee.zero.q q0 // initialize q0 to zero
|
||||
movi.n a9, 0 // initialize (16_bit_set) a9 to zero
|
||||
movi.n a10, 0 // initialize (32_bit_set) a10 to zero
|
||||
j _q_reg_prepared
|
||||
|
||||
_non_zero_constant:
|
||||
// Fill q register
|
||||
slli a6, a7, 8 // a6 - (masked)set_val << 8
|
||||
or a9, a6, a7 // a9 - (masked)set_val << 8 + (masked)set_val
|
||||
// a9 - 16-bit set
|
||||
slli a15, a9, 16 // a15 - a9 << 16
|
||||
or a10, a9, a15 // broadcast 8 bits from set_val a3 to 32 bits
|
||||
// a10 - 32-bit set
|
||||
ee.movi.32.q q0, a10, 0 // fill q0 register from a10 by 32 bits
|
||||
ee.movi.32.q q0, a10, 1
|
||||
ee.movi.32.q q0, a10, 2
|
||||
ee.movi.32.q q0, a10, 3
|
||||
|
||||
_q_reg_prepared:
|
||||
|
||||
// alignment check
|
||||
and a15, a11, a2 // 0xf (alignment mask) AND arr_dest pointer
|
||||
beqz a15, _arr_dest_aligned // branch if a15 equals to zero
|
||||
|
||||
movi.n a14, 16 // a14 - 16
|
||||
sub a15, a14, a15 // a15 = 16 - unalignment
|
||||
sub a4, a4, a15 // len = len - (16 - unalignment)
|
||||
|
||||
// keep setting until arr_dest is aligned
|
||||
// Check modulo 8 of the unalignment, if - then set 8 bytes
|
||||
bbci a15, 3, _aligning_mod_8_check // branch if 3-rd bit of unalignment a15 is clear
|
||||
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
s32i.n a10, a2, 4 // save 32 bits from a10 to arr_dest a2, offset 4 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
|
||||
_aligning_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the unalignment, if - then set 4 bytes
|
||||
bbci a15, 2, _aligning_mod_4_check // branch if 2-nd bit unalignment a15 is clear
|
||||
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_aligning_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the unalignment, if - then set 2 bytes
|
||||
bbci a15, 1, _aligning_mod_2_check // branch if 1-st bit unalignment a15 is clear
|
||||
s16i a9, a2, 0 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_aligning_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the unalignment, if - then copy 1 byte
|
||||
bbci a15, 0, _arr_dest_aligned // branch if 0-th bit unalignment a15 is clear
|
||||
s8i a7, a2, 0 // save 8 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 1 // increment arr_dest pointer by 1 byte
|
||||
|
||||
|
||||
_arr_dest_aligned:
|
||||
// Calculate main loop_len
|
||||
srli a5, a4, 4 // a5 - loop_len = set_size / 16
|
||||
|
||||
// Main loop
|
||||
loopnez a5, ._main_loop // 16 bytes in one loop
|
||||
ee.vst.128.ip q0, a2, 16 // store 16 bytes from q0 to arr_dest a2
|
||||
._main_loop:
|
||||
|
||||
// Check modulo 8 of the set_size, if - then set 8 bytes
|
||||
bbci a4, 3, _aligned_mod_8_check // branch if 3-rd bit of set_size a4 is clear
|
||||
ee.vst.l.64.ip q0, a2, 8 // save lower 64 bits from q0 to arr_dest a2, increase arr_dest pointer by 8 bytes
|
||||
_aligned_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the set_size, if - then set 4 bytes
|
||||
bbci a4, 2, _aligned_mod_4_check // branch if 2-nd bit of set_size a4 is clear
|
||||
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_aligned_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the set_size, if - then set 2 bytes
|
||||
bbci a4, 1, _aligned_mod_2_check // branch if 1-st bit of set_size a4 is clear
|
||||
s16i a9, a2, 0 // save 16 bits from a9 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_aligned_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the set_size, if - then set 1 byte
|
||||
bbci a4, 0, _aligned_mod_1_check // branch if 0-th bit of set_size a4 is clear
|
||||
s8i a7, a2, 0 // save 8 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
_aligned_mod_1_check:
|
||||
|
||||
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
_less_than_16:
|
||||
|
||||
// make 16-byte set_val
|
||||
slli a6, a3, 8 // a6 - a3 (set_val) << 8
|
||||
or a7, a6, a3 // a7 - a3 (set_val) << 8 + a3 (set_val)
|
||||
|
||||
// Check modulo 8 of the set_size, if - then set 8 bytes
|
||||
bbci a4, 3, _less_than_16_mod_8_check // branch if 3-rd bit of set_size a4 is clear
|
||||
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
s16i a7, a2, 2 // save 16 bits from a7 to arr_dest a2, offset 2 bytes
|
||||
s16i a7, a2, 4 // save 16 bits from a7 to arr_dest a2, offset 4 bytes
|
||||
s16i a7, a2, 6 // save 16 bits from a7 to arr_dest a2, offset 6 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 8 bytes
|
||||
_less_than_16_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the set_size, if - then set 4 bytes
|
||||
bbci a4, 2, _less_than_16_mod_4_check // branch if 2-nd bit of set_size a4 is clear
|
||||
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
s16i a7, a2, 2 // save 16 bits from a7 to arr_dest a2, offset 2 bytes
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_less_than_16_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the set_size, if - then set 2 bytes
|
||||
bbci a4, 1, _less_than_16_mod_2_check // branch if 1-st bit of set_size a4 is clear
|
||||
s16i a7, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_less_than_16_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the set_size, if - then set 1 byte
|
||||
bbci a4, 0, _less_than_16_mod_1_check // branch if 0-th bit of set_size a4 is clear
|
||||
s8i a3, a2, 0 // save 8 bits from a3 to arr_dest a2, offset 0 bytes
|
||||
_less_than_16_mod_1_check:
|
||||
|
||||
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
|
||||
#else // MEMSET_OPTIMIZED
|
||||
|
||||
// ansi version of the memset (without TIE instructions) for testing purposes
|
||||
|
||||
entry a1, 32
|
||||
mov a8, a2 // a8 - save arr_dest pointer
|
||||
|
||||
movi.n a7, 0xff // 0xff one-byte mask
|
||||
and a7, a7, a3 // mask upper 24 bits of a3
|
||||
|
||||
slli a6, a7, 8 // a6 - (masked)set_val << 8
|
||||
or a9, a6, a7 // a9 - (masked)set_val << 8 + (masked)set_val
|
||||
// a9 - 16-bit set
|
||||
slli a15, a9, 16 // a15 - a9 << 16
|
||||
or a10, a9, a15 // broadcast 8 bits from a3 to 32 bits
|
||||
|
||||
srli a5, a4, 4 // a5 - loop_len = arr_len / 16
|
||||
|
||||
// Run main loop which sets 16 bytes in one loop run
|
||||
loopnez a5, ._ansi_loop
|
||||
s32i.n a10, a2, 0 // save 32 bits from a15 to arr_dest a2
|
||||
s32i.n a10, a2, 4 // save 32 bits from a14 to arr_dest a2
|
||||
s32i.n a10, a2, 8 // save 32 bits from a14 to arr_dest a2
|
||||
s32i.n a10, a2, 12 // save 32 bits from a14 to arr_dest a2
|
||||
addi.n a2, a2, 16 // increment arr_dest pointer by 8 bytes
|
||||
._ansi_loop:
|
||||
|
||||
// Finish the remaining bytes out of the loop
|
||||
// Check modulo 8 of the arr_len, if - then set 8 bytes
|
||||
bbci a4, 3, _mod_8_check // branch if 2-nd bit of arr_len is clear
|
||||
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
s32i.n a10, a2, 4 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 8 // increment arr_dest pointer by 4 bytes
|
||||
_mod_8_check:
|
||||
|
||||
// Check modulo 4 of the arr_len, if - then set 4 bytes
|
||||
bbci a4, 2, _mod_4_check // branch if 2-nd bit of arr_len is clear
|
||||
s32i.n a10, a2, 0 // save 32 bits from a10 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 4 // increment arr_dest pointer by 4 bytes
|
||||
_mod_4_check:
|
||||
|
||||
// Check modulo 2 of the arr_len, if - then set 2 bytes
|
||||
bbci a4, 1, _mod_2_check // branch if 1-st bit of arr_len is clear
|
||||
s16i a9, a2, 0 // save 16 bits from a7 to arr_dest a2, offset 0 bytes
|
||||
addi.n a2, a2, 2 // increment arr_dest pointer by 2 bytes
|
||||
_mod_2_check:
|
||||
|
||||
// Check modulo 1 of the arr_len, if - then set 1 byte
|
||||
bbci a4, 0, _mod_1_check // branch if 0-th bit of arr_len is clear
|
||||
s8i a7, a2, 0 // save 8 bits from a3 to arr_dest a2, offset 0 bytes
|
||||
_mod_1_check:
|
||||
|
||||
// if arr_len is shorter than 16, skip adding TIE instruction, to fix the panic handler before the main_app() loads
|
||||
blti a4, 16, _less_than_16_1 // set_size shorter than 16, to fix panic handler before main_app() load
|
||||
#if TIE_ENABLE // put dummy TIE instruction to induce TIE context saving
|
||||
ee.zero.qacc // initialize q0 to zero
|
||||
#else // TIE_ENABLE
|
||||
nop // compensate one cycle, when TIE is disabled to get the same benchmark value
|
||||
#endif // TIE_ENABLE
|
||||
_less_than_16_1:
|
||||
|
||||
mov a2, a8 // copy the initial arr_dest pointer from a8 to arr_dest a2
|
||||
retw.n // return
|
||||
|
||||
#endif // MEMSET_OPTIMIZED
|
||||
|
||||
#endif // dsps_mem_aes3_enbled
|
||||
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#ifndef _dsps_mem_H_
#define _dsps_mem_H_

#include <stddef.h>     // size_t
#include <stdint.h>     // uint8_t
#include <string.h>     // memcpy / memset used by the fallback macros below

#include "dsp_err.h"
#include "dsp_common.h"
#include "dsps_mem_platform.h"

#ifdef __cplusplus
extern "C"
{
#endif

/**@{*/
/**
 * @brief   memory copy function using esp32s3 TIE
 *
 * The extension (_aes3) is optimized for esp32S3 chip.
 *
 * @param arr_dest: pointer to the destination array
 * @param arr_src: pointer to the source array
 * @param arr_len: count of bytes to be copied from arr_src to arr_dest
 *
 * @return: pointer to dest array
 */
void *dsps_memcpy_aes3(void *arr_dest, const void *arr_src, size_t arr_len);
/**@}*/

/**@{*/
/**
 * @brief   memory set function using esp32s3 TIE
 *
 * The extension (_aes3) is optimized for esp32S3 chip.
 *
 * @param arr_dest: pointer to the destination array
 * @param set_val: byte value, the dest array will be set with
 * @param set_size: count of bytes, the dest array will be set with
 *
 * @return: pointer to dest array
 */
void *dsps_memset_aes3(void *arr_dest, uint8_t set_val, size_t set_size);
/**@}*/

#ifdef __cplusplus
}
#endif

/*
 * dsps_memcpy / dsps_memset resolve to the esp32s3 optimized implementations only when
 * optimizations are enabled (CONFIG_DSP_OPTIMIZED) and the platform header reports the
 * aes3 variant as available; in every other configuration they resolve to the standard
 * memcpy / memset.
 */
#if CONFIG_DSP_OPTIMIZED && dsps_mem_aes3_enbled
#define dsps_memcpy dsps_memcpy_aes3
#define dsps_memset dsps_memset_aes3
#else
#define dsps_memcpy memcpy
#define dsps_memset memset
#endif // CONFIG_DSP_OPTIMIZED && dsps_mem_aes3_enbled

#endif // _dsps_mem_H_
|
||||
@@ -0,0 +1,21 @@
|
||||
#ifndef _dsps_mem_platform_H_
#define _dsps_mem_platform_H_

#include "sdkconfig.h"

/*
 * Selects whether the esp32s3 (aes3) memcpy/memset variants are built.
 * dsps_mem_aes3_enbled is defined only for Xtensa cores that report both the
 * FP and the zero-overhead loop options: 1 on the ESP32-S3 target, 0 otherwise.
 * In every other configuration the macro stays undefined.
 */
#if defined(__XTENSA__)

#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>

#if (XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1)

#if !CONFIG_IDF_TARGET_ESP32S3
#define dsps_mem_aes3_enbled 0
#else
#define dsps_mem_aes3_enbled 1
#endif // !CONFIG_IDF_TARGET_ESP32S3

#endif // XCHAL_HAVE_FP && XCHAL_HAVE_LOOPS

#endif // defined(__XTENSA__)

#endif // _dsps_mem_platform_H_
|
||||
@@ -0,0 +1,728 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include <malloc.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include "unity.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_err.h"
|
||||
#include "esp_dsp.h"
|
||||
|
||||
#include "dsps_mem.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#include "freertos/semphr.h"
|
||||
#include "freertos/queue.h"
|
||||
#include "freertos/timers.h"
|
||||
#include "esp_task_wdt.h"
|
||||
|
||||
// Numeric parameters for the tests in this file; their use sites lie beyond this part of the file.
#define CORNERS_CPY_SET_COUNT 200
|
||||
#define MEMCPY_REPORT_LEN 100
|
||||
#define MEMSET_REPORT_LEN 50
|
||||
#define CALL_REPEAT_COUNT 1000
|
||||
#define TEST_PINNED_NUM_TASKS 2
|
||||
#define TEST_PINNED_NUM_ITERS 2
|
||||
#define CPY_REPEAT_COUNT 500
|
||||
#define CPY_ITERS 40
|
||||
#define AREA_LENGTH 1024
|
||||
|
||||
// File-scope tag string; presumably passed to the esp_log macros (esp_log.h is included) - confirm at the use sites
static const char *TAG = "dsps_mem_access";
|
||||
|
||||
/*
|
||||
Test functionality of the memcpy and memset functions optimized for esp32s3
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Test that esp32s3 optimized memcpy and memset have the same functionality as the original memcpy and memset
|
||||
|
||||
Procedure:
|
||||
- Create 4 arrays, 2 source arrays (aligned and unaligned) and 2 destination arrays (aligned and unaligned)
|
||||
- Initialize the destination arrays to 0, fill the source arrays with non-zero values
|
||||
- Copy the desired length of content from the source array to the destination array using memcpy
|
||||
- Compare the content of the destination array with the content of the source array
|
||||
- Initialize the destination arrays to 0
|
||||
- Repeat the 3 above steps for different copy lengths (especially corner conditions like copy 0, 1, 2... and N, N -1, N - 2.... bytes)
|
||||
and following arrays alignments
|
||||
- destination array 16-byte aligned, source array 16-byte aligned
|
||||
- destination array unaligned, source array 16-byte aligned
|
||||
- destination array 16-byte aligned, source array unaligned
|
||||
- destination array unaligned, source array unaligned
|
||||
- Set the desired length of the destination array using memset
|
||||
- Compare the content of the destination array with the set constant
|
||||
- Initialize the destination arrays to 0
|
||||
- Repeat the 3 above steps for different set lengths (especially corner conditions like copy 0, 1, 2... and N, N -1, N - 2.... bytes)
|
||||
and both alignments of the destination array (16-byte aligned or unaligned)
|
||||
- Free the dynamic array
|
||||
*/
|
||||
|
||||
TEST_CASE("dsps_memcpy_memset_aes3_functionality", "[dsps]")
{
    /*
     * Functional check of dsps_memcpy / dsps_memset against the expected buffer content.
     *
     * Every combination of aligned / unaligned source and destination is exercised for
     * lengths 1..CORNERS_CPY_SET_COUNT and (arr_len - CORNERS_CPY_SET_COUNT + 1)..arr_len.
     * After each call the test checks the written bytes, the untouched remainder of the
     * destination and the canary bytes placed behind the destination.
     */
    const size_t arr_len = 1024;
    const uint8_t set_val = 0xaa;
    const size_t full_count = arr_len;
    const size_t canary_bytes = 16;                 // canary bytes to check a possible overflow
    const size_t misalign_offset = 1;               // offset that moves a buffer off the 16-byte boundary
    const unsigned int align_combinations_cpy = 4;  // source and destination arrays aligned or unaligned combinations
    const unsigned int align_combinations_set = 2;  // destination array aligned or unaligned

    uint8_t *arr_dest_align = (uint8_t *)memalign(16, (arr_len + canary_bytes) * sizeof(uint8_t));
    uint8_t *arr_src_align = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));

    // malloc() gives no guarantee that a block is NOT 16-byte aligned, so the unaligned
    // buffers are carved out of aligned allocations with a fixed one-byte offset
    uint8_t *dest_unalign_base = (uint8_t *)memalign(16, (arr_len + canary_bytes + misalign_offset) * sizeof(uint8_t));
    uint8_t *src_unalign_base = (uint8_t *)memalign(16, (arr_len + misalign_offset) * sizeof(uint8_t));

    TEST_ASSERT_NOT_NULL(arr_dest_align);
    TEST_ASSERT_NOT_NULL(arr_src_align);
    TEST_ASSERT_NOT_NULL(dest_unalign_base);
    TEST_ASSERT_NOT_NULL(src_unalign_base);

    uint8_t *arr_dest_unalign = dest_unalign_base + misalign_offset;
    uint8_t *arr_src_unalign = src_unalign_base + misalign_offset;
    uint8_t *arr_dest = NULL, *arr_src = NULL;

    // non-zero test pattern in both source arrays
    for (size_t i = 0; i < arr_len; i++) {
        arr_src_align[i] = (uint8_t)i;
        arr_src_unalign[i] = (uint8_t)i;
    }

    // canary bytes behind both destination arrays
    memset(&arr_dest_align[arr_len], 0, canary_bytes);
    memset(&arr_dest_unalign[arr_len], 0, canary_bytes);

    // aes3 memcpy functionality
    for (unsigned int align = 0; align < align_combinations_cpy; align++) {     // aligned and unaligned arrays test loop

        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};         // amount of bytes to be copied

        // bit 0 selects the destination alignment, bit 1 the source alignment:
        // 0 - both aligned, 1 - destination unaligned, 2 - source unaligned, 3 - both unaligned
        arr_dest = (align & 1u) ? arr_dest_unalign : arr_dest_align;
        arr_src = (align & 2u) ? arr_src_unalign : arr_src_align;

        for (int var = 0; var < 2; var++) {                                     // test corner conditions
            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {                   // memcpy from 1 to CORNERS_CPY_SET_COUNT
                                                                                // and from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
                memset(arr_dest, 0, full_count);                                // destination array initializing

                dsps_memcpy((void *)arr_dest, (void *)arr_src, ++byte_count[var]);

                TEST_ASSERT_EQUAL_UINT8_ARRAY(arr_src, arr_dest, byte_count[var]);
                if (byte_count[var] < arr_len) {
                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[byte_count[var]], (arr_len - byte_count[var]));
                }
                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
            }
        }
    }

    // aes3 memset functionality
    for (unsigned int align = 0; align < align_combinations_set; align++) {     // aligned and unaligned arrays test loop

        size_t byte_count[2] = {0, full_count - CORNERS_CPY_SET_COUNT};         // amount of bytes to be set

        arr_dest = (align == 0) ? arr_dest_align : arr_dest_unalign;

        for (int var = 0; var < 2; var++) {                                     // test corner conditions
            for (int j = 0; j < CORNERS_CPY_SET_COUNT; j++) {                   // memset from 1 to CORNERS_CPY_SET_COUNT
                                                                                // and from (full_count - CORNERS_CPY_SET_COUNT + 1) to full_count
                memset(arr_dest, 0, full_count);                                // destination array initializing

                dsps_memset((void *)arr_dest, set_val, ++byte_count[var]);

                TEST_ASSERT_EACH_EQUAL_UINT8(set_val, arr_dest, byte_count[var]);
                if (byte_count[var] < arr_len) {
                    TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[byte_count[var]], (arr_len - byte_count[var]));
                }
                TEST_ASSERT_EACH_EQUAL_UINT8(0, &arr_dest[arr_len], canary_bytes);
            }
        }
    }

    free(arr_dest_align);
    free(arr_src_align);
    free(dest_unalign_base);
    free(src_unalign_base);
}
|
||||
|
||||
|
||||
/*
|
||||
Test micro-benchmark of the memcpy and memset functions optimized for esp32s3 and esp32
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Test how fast the esp32s3 optimized memcpy and memset are compared to the esp32 optimized memcpy and memset
|
||||
|
||||
Procedure:
|
||||
- Create 2 unaligned arrays, source and destination array
|
||||
- Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
|
||||
- Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
|
||||
- Set the destination array using esp32s3 memset N times, while counting CPU cycles
|
||||
- Set the destination array using esp32 memset N times, while counting CPU cycles
|
||||
- Calculate benchmarks
|
||||
- Free both arrays
|
||||
*/
|
||||
|
||||
TEST_CASE("dsps_memcpy_memset_aes3_benchmark", "[dsps]")
{
    /*
     * Micro-benchmark: average CPU cycles per call of dsps_memcpy / dsps_memset
     * compared with the toolchain memcpy / memset on heap buffers of AREA_LENGTH bytes.
     * The results are only logged, nothing is asserted on the timing.
     */
    const size_t area_len = AREA_LENGTH;                        // full length of the area (in bytes)
    const size_t full_count = sizeof(uint8_t) * area_len;
    const uint8_t set_val = 0xee;                               // constant value, the destination array will be set with

    uint8_t *arr_src = (uint8_t *)malloc(area_len * sizeof(uint8_t));
    uint8_t *arr_dest = (uint8_t *)malloc(area_len * sizeof(uint8_t));
    TEST_ASSERT_NOT_NULL(arr_src);
    TEST_ASSERT_NOT_NULL(arr_dest);

    // give the source defined content instead of copying uninitialised heap memory
    for (size_t i = 0; i < area_len; i++) {
        arr_src[i] = (uint8_t)i;
    }

    // Memcpy benchmark
    const unsigned int start_aes3_memcpy = dsp_get_cpu_cycle_count();
    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
        dsps_memcpy((void *)arr_dest, (void *)arr_src, full_count);
    }
    const unsigned int end_aes3_memcpy = dsp_get_cpu_cycle_count();

    const unsigned int start_ae32_memcpy = dsp_get_cpu_cycle_count();
    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
        memcpy((void *)arr_dest, (void *)arr_src, full_count);
    }
    const unsigned int end_ae32_memcpy = dsp_get_cpu_cycle_count();

    const float aes3_cycles_memcpy = ((float)(end_aes3_memcpy - start_aes3_memcpy)) / CALL_REPEAT_COUNT;
    const float ae32_cycles_memcpy = ((float)(end_ae32_memcpy - start_ae32_memcpy)) / CALL_REPEAT_COUNT;

    ESP_LOGI(TAG, "Micro benchmark of memcpy for unaligned array of %"PRIu32" bytes", (uint32_t)full_count);
    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ae32_cycles_memcpy);
    ESP_LOGI(TAG, "S3 optimized cycles = %.2f", aes3_cycles_memcpy);

    // Memset benchmark
    const unsigned int start_aes3_memset = dsp_get_cpu_cycle_count();
    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
        dsps_memset((void *)arr_dest, set_val, full_count);
    }
    const unsigned int end_aes3_memset = dsp_get_cpu_cycle_count();

    const unsigned int start_ae32_memset = dsp_get_cpu_cycle_count();
    for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
        memset((void *)arr_dest, set_val, full_count);
    }
    const unsigned int end_ae32_memset = dsp_get_cpu_cycle_count();

    const float ae32_cycles_memset = ((float)(end_ae32_memset - start_ae32_memset)) / CALL_REPEAT_COUNT;
    const float aes3_cycles_memset = ((float)(end_aes3_memset - start_aes3_memset)) / CALL_REPEAT_COUNT;

    ESP_LOGI(TAG, "Micro benchmark of memset for unaligned array of %"PRIu32" bytes", (uint32_t)full_count);
    ESP_LOGI(TAG, "Not-optimized cycles = %.2f", ae32_cycles_memset);
    ESP_LOGI(TAG, "S3 optimized cycles = %.2f", aes3_cycles_memset);

    free(arr_src);
    free(arr_dest);
}
|
||||
|
||||
|
||||
/*
|
||||
Test micro-benchmark of the memcpy optimized for esp32s3 and esp32 and print a comparison report for copy lengths from
|
||||
1 to MEMCPY_REPORT_LEN bytes, where the difference between the two memcpys is not clear-cut
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Test how fast the esp32s3 optimized memcpy is to the esp32 optimized memcpy
|
||||
|
||||
Procedure:
|
||||
- Create 2 aligned arrays, source and destination array
|
||||
- Copy the content of the source array to the destination array using esp32s3 memcpy N times, while counting CPU cycles
|
||||
- Copy the content of the source array to the destination array using esp32 memcpy N times, while counting CPU cycles
|
||||
- Calculate benchmarks and save the result
|
||||
- Repeat the 3 above steps for different copy lengths (from 1 to MEMCPY_REPORT_LEN bytes)
|
||||
and following arrays alignments
|
||||
- destination array 16-byte aligned, source array 16-byte aligned
|
||||
- destination array unaligned, source array 16-byte aligned
|
||||
- destination array 16-byte aligned, source array unaligned
|
||||
- destination array unaligned, source array unaligned
|
||||
- Print table of results
|
||||
- Free dynamic arrays
|
||||
*/
|
||||
TEST_CASE("dsps_memcpy_benchmark_report", "[dsps]")
{
    /*
     * Per-length memcpy report: for copy lengths 1..MEMCPY_REPORT_LEN and all four
     * source/destination alignment combinations, measure the average cycles per call of
     * dsps_memcpy and of the toolchain memcpy, print the table and check every
     * dsps_memcpy result against a quarter of the matching memcpy result.
     */
    unsigned int start_count, end_count;
    const unsigned int align_combinations = 4;      // source and destination arrays aligned or unaligned combinations
    const int32_t arr_len = 256;

    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    uint8_t *arr_src = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    TEST_ASSERT_NOT_NULL(arr_dest);
    TEST_ASSERT_NOT_NULL(arr_src);
    uint8_t *arr_dest_align = NULL, *arr_src_align = NULL;

    uint16_t **result_aes3 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));  // 2D arrays result_aes3[align_combinations][MEMCPY_REPORT_LEN]
    uint16_t **result_ae32 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));  // 2D arrays result_ae32[align_combinations][MEMCPY_REPORT_LEN]
    TEST_ASSERT_NOT_NULL(result_aes3);
    TEST_ASSERT_NOT_NULL(result_ae32);

    for (unsigned int i = 0; i < align_combinations; i++) {
        result_aes3[i] = (uint16_t *)malloc(MEMCPY_REPORT_LEN * sizeof(uint16_t));
        result_ae32[i] = (uint16_t *)malloc(MEMCPY_REPORT_LEN * sizeof(uint16_t));
        TEST_ASSERT_NOT_NULL(result_aes3[i]);
        TEST_ASSERT_NOT_NULL(result_ae32[i]);
    }

    for (unsigned int iter = 0; iter < align_combinations; iter++) {
        // bit 0 selects the destination alignment, bit 1 the source alignment:
        // 0 - both 16-byte aligned, 1 - destination unaligned, 2 - source unaligned, 3 - both unaligned
        arr_dest_align = arr_dest + (iter & 1u);
        arr_src_align = arr_src + ((iter >> 1) & 1u);

        for (int cpy_amount = 1; cpy_amount <= MEMCPY_REPORT_LEN; cpy_amount++) {

            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                dsps_memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_aes3[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));

            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                memcpy((void *)arr_dest_align, (void *)arr_src_align, cpy_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_ae32[iter][cpy_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
        }
    }

    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned source/destination array using default xtensa memcpy and s3 optimized memcpy");
    printf("\n\tdest aligned \tdest unaligned\tdest aligned\tdest unaligned\n");
    printf( "\tsrc aligned \tsrc aligned\tsrc unaligned\tsrc unaligned\n\n");
    printf( "byte \taes3 ae32\taes3 ae32\taes3 ae32\taes3 ae32\n");

    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
        printf("%d\t", i + 1);

        for (unsigned int j = 0; j < align_combinations; j++) {
            printf(" %d\t", result_aes3[j][i]);
            printf(" %d\t", result_ae32[j][i]);
        }
        putchar('\n');
    }

    for (int i = 0; i < MEMCPY_REPORT_LEN; i++) {
        for (unsigned int j = 0; j < align_combinations; j++) {
            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 4, result_aes3[j][i]);
        }
    }

    // release the rows first, they were leaked when only the row tables were freed
    for (unsigned int i = 0; i < align_combinations; i++) {
        free(result_aes3[i]);
        free(result_ae32[i]);
    }

    free(arr_dest);
    free(arr_src);
    free(result_ae32);
    free(result_aes3);
}
|
||||
|
||||
/*
|
||||
Test micro-benchmark of the memset optimized for esp32s3 and esp32 and print a comparison report for set lengths from
|
||||
1 to MEMSET_REPORT_LEN bytes, where the difference between the two memsets is not clear-cut
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Test how fast the esp32s3 optimized memset is compared to the esp32 optimized memset
|
||||
|
||||
Procedure:
|
||||
- Create 1 aligned array - destination array
|
||||
- Set the destination array using esp32s3 memset N times, while counting CPU cycles
|
||||
- Set the destination array using esp32 memset N times, while counting CPU cycles
|
||||
- Calculate benchmarks and save the result
|
||||
- Repeat the 3 above steps for different set lengths (from 1 to MEMSET_REPORT_LEN bytes)
|
||||
and both destination arrays alignments (16-byte aligned and unaligned)
|
||||
- Print table of results
|
||||
- Free dynamic arrays
|
||||
*/
|
||||
TEST_CASE("dsps_memset_benchmark_report", "[dsps]")
{
    /*
     * Per-length memset report: for set lengths 1..MEMSET_REPORT_LEN and both destination
     * alignments, measure the average cycles per call of dsps_memset and of the toolchain
     * memset, print the table and check every dsps_memset result against an eighth of the
     * matching memset result.
     */
    unsigned int start_count, end_count;
    const unsigned int align_combinations = 2;      // destination arrays aligned or unaligned
    const int32_t arr_len = 256;
    const uint8_t set_val = 0xaa;

    uint8_t *arr_dest = (uint8_t *)memalign(16, arr_len * sizeof(uint8_t));
    TEST_ASSERT_NOT_NULL(arr_dest);
    uint8_t *arr_dest_align = NULL;

    uint16_t **result_aes3 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));  // 2D arrays result_aes3[align_combinations][MEMSET_REPORT_LEN]
    uint16_t **result_ae32 = (uint16_t **)malloc(align_combinations * sizeof(uint16_t *));  // 2D arrays result_ae32[align_combinations][MEMSET_REPORT_LEN]
    TEST_ASSERT_NOT_NULL(result_aes3);
    TEST_ASSERT_NOT_NULL(result_ae32);

    for (unsigned int i = 0; i < align_combinations; i++) {
        result_aes3[i] = (uint16_t *)malloc(MEMSET_REPORT_LEN * sizeof(uint16_t));
        result_ae32[i] = (uint16_t *)malloc(MEMSET_REPORT_LEN * sizeof(uint16_t));
        TEST_ASSERT_NOT_NULL(result_aes3[i]);
        TEST_ASSERT_NOT_NULL(result_ae32[i]);
    }

    for (unsigned int iter = 0; iter < align_combinations; iter++) {

        // iteration 0 - destination 16-byte aligned, iteration 1 - destination unaligned
        arr_dest_align = (iter == 0) ? arr_dest : arr_dest + 1;

        for (int set_amount = 1; set_amount <= MEMSET_REPORT_LEN; set_amount++) {
            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                dsps_memset((void *)arr_dest_align, set_val, set_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_aes3[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));

            start_count = dsp_get_cpu_cycle_count();
            for (int j = 0; j < CALL_REPEAT_COUNT; j++) {
                memset((void *)arr_dest_align, set_val, set_amount);
            }
            end_count = dsp_get_cpu_cycle_count();
            result_ae32[iter][set_amount - 1] = ((uint16_t)((end_count - start_count) / CALL_REPEAT_COUNT));
        }
    }

    // this report is about memset, the message used to name memcpy
    ESP_LOGI(TAG, "Cycle counts for aligned/unaligned destination array using default xtensa memset and s3 optimized memset");
    printf("\n\tdest aligned \tdest unaligned\n\n");
    printf( "byte \taes3 ae32\taes3 ae32\n");

    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
        printf("%d\t", i + 1);

        for (unsigned int j = 0; j < align_combinations; j++) {
            printf(" %d\t", result_aes3[j][i]);
            printf(" %d\t", result_ae32[j][i]);
        }
        putchar('\n');
    }

    for (int i = 0; i < MEMSET_REPORT_LEN; i++) {
        for (unsigned int j = 0; j < align_combinations; j++) {
            TEST_ASSERT_GREATER_OR_EQUAL((result_ae32[j][i]) / 8, result_aes3[j][i]);
        }
    }

    // release the rows first, they were leaked when only the row tables were freed
    for (unsigned int i = 0; i < align_combinations; i++) {
        free(result_aes3[i]);
        free(result_ae32[i]);
    }

    free(arr_dest);
    free(result_ae32);
    free(result_aes3);
}
|
||||
|
||||
/*
|
||||
Test micro-benchmark of the memcpy and memset functions optimized for esp32s3, with task switching
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Test how fast the esp32s3 optimized memcpy and memset are while running memset and memcpy in multiple tasks
|
||||
|
||||
Procedure:
|
||||
- Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same.
|
||||
- Run the memcpy micro-benchmark routine (from the previous test case) in each of the tasks.
|
||||
- Start all the tasks simultaneously
|
||||
- Wait for the tasks to complete, then delete the tasks
|
||||
- Get the benchmark result
|
||||
- Repeat all the above steps with memset, instead of memcpy
|
||||
- Free the created dynamic arrays
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
SemaphoreHandle_t semaphore;
|
||||
uint8_t *arr_src;
|
||||
uint8_t *arr_dest;
|
||||
uint8_t set_val;
|
||||
size_t area_len;
|
||||
uint32_t mean_val_cpy;
|
||||
uint32_t mean_val_set;
|
||||
} test_context_benchmark_t;
|
||||
|
||||
|
||||
static void pinned_task_benchmark_memcpy(void *arg)
|
||||
{
|
||||
ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
|
||||
test_context_benchmark_t *context = (test_context_benchmark_t *)arg;
|
||||
long unsigned int cycles_acc = 0;
|
||||
unsigned int start_memcpy_count, end_memcpy_count;
|
||||
|
||||
for (int j = 0; j < CPY_ITERS; j++) {
|
||||
start_memcpy_count = dsp_get_cpu_cycle_count();
|
||||
for (int i = 0; i < CPY_REPEAT_COUNT; i++) {
|
||||
dsps_memcpy((void *)context->arr_dest, (void *)context->arr_src, context->area_len);
|
||||
}
|
||||
end_memcpy_count = dsp_get_cpu_cycle_count();
|
||||
cycles_acc += (end_memcpy_count - start_memcpy_count);
|
||||
vTaskDelay(1); // Block to cause a context switch, forcing the TIE context to be saved
|
||||
}
|
||||
|
||||
context->mean_val_cpy += (uint32_t)((cycles_acc / CPY_REPEAT_COUNT) / CPY_ITERS);
|
||||
|
||||
// Indicate done and wait to be deleted
|
||||
xSemaphoreGive(context->semaphore);
|
||||
vTaskSuspend(NULL);
|
||||
}
|
||||
|
||||
|
||||
static void pinned_task_benchmark_memset(void *arg)
|
||||
{
|
||||
ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
|
||||
test_context_benchmark_t *context = (test_context_benchmark_t *)arg;
|
||||
long unsigned int cycles_acc = 0;
|
||||
unsigned int start_memset_count, end_memset_count;
|
||||
|
||||
for (int j = 0; j < CPY_ITERS; j++) {
|
||||
start_memset_count = dsp_get_cpu_cycle_count();
|
||||
for (int i = 0; i < CPY_REPEAT_COUNT; i++) {
|
||||
dsps_memset((void *)context->arr_dest, context->set_val, context->area_len);
|
||||
}
|
||||
end_memset_count = dsp_get_cpu_cycle_count();
|
||||
cycles_acc += (end_memset_count - start_memset_count);
|
||||
vTaskDelay(1); // Block to cause a context switch, forcing the TIE context to be saved
|
||||
}
|
||||
|
||||
context->mean_val_set += (uint32_t)((cycles_acc / CPY_REPEAT_COUNT) / CPY_ITERS);
|
||||
|
||||
// Indicate done and wait to be deleted
|
||||
xSemaphoreGive(context->semaphore);
|
||||
vTaskSuspend(NULL);
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("dsps_memset_memcpy_context_switch_benchmark", "[dsps]")
{
    /*
     * Runs the memcpy benchmark task and then the memset benchmark task in
     * TEST_PINNED_NUM_TASKS tasks pinned to each core, all sharing one context,
     * and prints the mean cycle counts reported by the tasks.
     */
    test_context_benchmark_t test_context;
    char task_name[10];

    test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
    test_context.area_len = (size_t)AREA_LENGTH;
    test_context.arr_dest = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
    test_context.arr_src = (uint8_t *)malloc(AREA_LENGTH * sizeof(uint8_t));
    test_context.set_val = 0xab;
    test_context.mean_val_cpy = 0;
    test_context.mean_val_set = 0;

    TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);
    TEST_ASSERT_NOT_NULL(test_context.arr_dest);
    TEST_ASSERT_NOT_NULL(test_context.arr_src);

    // give the source defined content instead of copying uninitialised heap memory
    memset(test_context.arr_src, test_context.set_val, test_context.area_len);

    // iteration 0 benchmarks memcpy, iteration 1 benchmarks memset
    static void (*pinned_functions[2])(void *);
    pinned_functions[0] = pinned_task_benchmark_memcpy;
    pinned_functions[1] = pinned_task_benchmark_memset;

    for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
        TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];

        // Create test tasks for each core
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                // bounded formatting, the name buffer holds only 10 bytes
                snprintf(task_name, sizeof(task_name), "task %d-%d", i, j);
                TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
                                  &test_context, 10, &task_handles[i][j], i));
            }
        }

        // Start the created tasks simultaneously
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                xTaskNotifyGive(task_handles[i][j]);
            }
        }

        // Wait for the tasks to complete
        for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
            xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
        }

        // Delete the tasks
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                vTaskDelete(task_handles[i][j]);
            }
        }

        vTaskDelay(10);     // Short delay to allow the idle task to free task memory and TIE contexts
    }

    vSemaphoreDelete(test_context.semaphore);
    free(test_context.arr_dest);
    free(test_context.arr_src);

    const uint32_t iterations = (uint32_t)(configNUM_CORES * TEST_PINNED_NUM_TASKS * CPY_REPEAT_COUNT * CPY_ITERS);
    const uint32_t copy_mean_val = (uint32_t)(test_context.mean_val_cpy / (configNUM_CORES * TEST_PINNED_NUM_TASKS));
    const uint32_t set_mean_val = (uint32_t)(test_context.mean_val_set / (configNUM_CORES * TEST_PINNED_NUM_TASKS));

    printf("\nOut of %"PRIu32" iterations, array len of %"PRIu32" bytes\n", iterations, (uint32_t)AREA_LENGTH);
    printf("Memcpy cycles = %"PRIu32"\n", copy_mean_val);
    printf("Memset cycles = %"PRIu32"\n", set_mean_val);
}
|
||||
|
||||
|
||||
/*
|
||||
Test context switching for the TIE disabled and enabled
|
||||
|
||||
Requires: esp32s3
|
||||
|
||||
Purpose:
|
||||
- Compare context switching between the tasks when TIE (esp32s3 instruction extension) is enabled and disabled to
|
||||
see what is the switching time overhead for the TIE enabled
|
||||
|
||||
Procedure:
|
||||
- Create a timer, 1000 ms is used for this test, but the exact time is not crucial
|
||||
- Create 4 tasks - 2 tasks per each core. Tasks are pinned to cores and all the tasks are the same
|
||||
- Start the created tasks simultaneously, start the timer
|
||||
- A task executes a single assembler instruction from the TIE, to induce the context switch
|
||||
- As soon, as the instruction is executed, a context switch occurs
|
||||
- A counter counts the number of context switches within the interval specified by the timer
|
||||
- Wait for the timer to expire and terminate the tasks
|
||||
- Get the number of task switches and delete all the tasks
|
||||
- Repeat the 7 above steps with the created tasks executing a single generic Xtensa assembler instruction,
|
||||
instead of the TIE instruction to get the switching overhead
|
||||
*/
|
||||
|
||||
static bool timer_expired = false;
|
||||
static TimerHandle_t one_shot_timer = NULL;
|
||||
|
||||
typedef struct {
|
||||
SemaphoreHandle_t semaphore;
|
||||
uint32_t switch_count_tie_on;
|
||||
uint32_t switch_count_tie_off;
|
||||
} test_context_timing_t;
|
||||
|
||||
// Task pinned to a core, executing TIE instruction
|
||||
static void pinned_task_tie_on(void *arg)
|
||||
{
|
||||
ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
|
||||
test_context_timing_t *context = (test_context_timing_t *)arg;
|
||||
vTaskDelay(1);
|
||||
|
||||
while (!timer_expired) {
|
||||
asm volatile("ee.zero.q q0");
|
||||
context->switch_count_tie_on++;
|
||||
taskYIELD(); // Block to cause a context switch, forcing the TIE context to be saved
|
||||
}
|
||||
xSemaphoreGive(context->semaphore);
|
||||
vTaskSuspend(NULL);
|
||||
}
|
||||
|
||||
// Task pinned to a core, executing generic Xtensa instruction
|
||||
static void pinned_task_tie_off(void *arg)
|
||||
{
|
||||
ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
|
||||
test_context_timing_t *context = (test_context_timing_t *)arg;
|
||||
vTaskDelay(1);
|
||||
|
||||
while (!timer_expired) {
|
||||
asm volatile("nop");
|
||||
context->switch_count_tie_off++;
|
||||
taskYIELD(); // Block to cause a context switch, forcing the context to be saved
|
||||
}
|
||||
|
||||
xSemaphoreGive(context->semaphore);
|
||||
vTaskSuspend(NULL);
|
||||
}
|
||||
|
||||
/*
 * Timer callback: raises the timer_expired flag polled by the pinned tasks.
 *
 * xTimer - handle of the timer that expired (not used)
 */
static void context_switch_timer_callback(TimerHandle_t xTimer)
{
    (void)xTimer;
    timer_expired = true;
}
|
||||
|
||||
|
||||
TEST_CASE("dsps_TIE_context_switch_timing", "[dsps]")
{
    /*
     * Counts how many yield loops the pinned tasks complete within one timer period,
     * first with tasks executing a TIE instruction, then with tasks executing a nop,
     * and prints both counts together with the relative overhead.
     */
    test_context_timing_t test_context;
    const TickType_t timer_period_ms = 1000;
    char task_name[10];

    test_context.semaphore = xSemaphoreCreateCounting(configNUM_CORES * TEST_PINNED_NUM_TASKS, 0);
    test_context.switch_count_tie_off = 0;
    test_context.switch_count_tie_on = 0;
    TEST_ASSERT_NOT_EQUAL(NULL, test_context.semaphore);

    // iteration 0 runs the TIE tasks, iteration 1 runs the non-TIE tasks
    static void (*pinned_functions[2])(void *);
    pinned_functions[0] = pinned_task_tie_on;
    pinned_functions[1] = pinned_task_tie_off;

    one_shot_timer = xTimerCreate("timer", pdMS_TO_TICKS(timer_period_ms), pdFALSE, (void *)0, context_switch_timer_callback);
    TEST_ASSERT_NOT_NULL(one_shot_timer);

    for (int iter = 0; iter < TEST_PINNED_NUM_ITERS; iter++) {
        timer_expired = false;
        TaskHandle_t task_handles[configNUM_CORES][TEST_PINNED_NUM_TASKS];

        // Create test tasks for each core
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                // bounded formatting, the name buffer holds only 10 bytes
                snprintf(task_name, sizeof(task_name), "task %d-%d", i, j);
                TEST_ASSERT_EQUAL(pdTRUE, xTaskCreatePinnedToCore(pinned_functions[iter], task_name, 4096,
                                  &test_context, 1, &task_handles[i][j], i));
            }
        }

        // Start the created tasks simultaneously
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                xTaskNotifyGive(task_handles[i][j]);
            }
        }

        // The tasks only leave their loop once the timer has fired, so a timer that
        // failed to start must fail the test instead of blocking it forever
        TEST_ASSERT_EQUAL(pdPASS, xTimerStart(one_shot_timer, portMAX_DELAY));
        vTaskDelay(1);

        // Wait for the tasks to complete
        for (int i = 0; i < configNUM_CORES * TEST_PINNED_NUM_TASKS; i++) {
            xSemaphoreTake(test_context.semaphore, portMAX_DELAY);
        }

        // Delete the tasks
        for (int i = 0; i < configNUM_CORES; i++) {
            for (int j = 0; j < TEST_PINNED_NUM_TASKS; j++) {
                vTaskDelete(task_handles[i][j]);
            }
        }

        vTaskDelay(10);     // Short delay to allow the idle task to free task memory and TIE contexts
    }

    // the timer used to be leaked on every run of the test case
    xTimerDelete(one_shot_timer, portMAX_DELAY);
    one_shot_timer = NULL;

    vSemaphoreDelete(test_context.semaphore);

    printf("\nContext switching count within %"PRIu32" ms interval\n", (uint32_t)timer_period_ms);
    printf("TIE enabled %"PRIu32"\n", test_context.switch_count_tie_on);
    printf("TIE disabled %"PRIu32"\n", test_context.switch_count_tie_off);

    float overhead = (((float)test_context.switch_count_tie_off / (float)test_context.switch_count_tie_on) * 100) - 100;
    printf("Switch overhead %.2f %%\n", overhead);
}
|
||||
Reference in New Issue
Block a user