add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,398 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// 458755 == 0x70003: status returned when one of the range checks below fails.
// The C reference returns ESP_ERR_DSP_PARAM_OUTOFRANGE for the same checks.
.literal .LC0_1_61, 458755
# Program Unit: dspi_dotprod_off_s16_aes3
//
// dspi_dotprod_off_s16_aes3 - 2D dot product with filter offset, int16 data.
// All seven arguments are forwarded unchanged to dspi_dotprod_off_s16_ansi on
// the fallback paths, so the C signature is presumably the same as that one.
//
// Xtensa windowed ABI, frame of 128 bytes:
//   a2 = in_image   a3 = filter   a4 = out_value
//   a5 = count_x    a6 = count_y  a7 = shift
//   [a1+128] = offset (7th argument, read with l16si)
// Structure fields read here (names follow the checks in the C reference):
//   +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// Returns in a2: 0 on the vector path, .LC0_1_61 on a failed range check,
// otherwise whatever the ANSI fallback returns.
.type dspi_dotprod_off_s16_aes3, @function
.align 4
.global dspi_dotprod_off_s16_aes3
dspi_dotprod_off_s16_aes3: # 0x4
.LBB1_dspi_dotprod_off_s16_aes3: # 0x4
entry a1,128 #
// Range check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:760
l32i.n a12,a2,12 # [1] id:759
mull a8,a10,a5 # [2]
blt a12,a8,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
// Range check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a13,a2,8 # [0] id:761
l32i.n a9,a2,16 # [1] id:762
mull a11,a13,a6 # [2]
blt a9,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
// Range check 3: filter->step_x * count_x must not exceed filter->stride_x.
l32i.n a15,a3,4 # [0] id:764
l32i.n a14,a3,12 # [1] id:763
mull a11,a15,a5 # [2]
blt a14,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
// Range check 4: filter->step_y * count_y must not exceed filter->stride_y.
// filter->step_y is spilled to [a1+88] because a9 is reused for the product.
l32i.n a8,a3,16 # [0] id:766
l32i.n a9,a3,8 # [1] id:765
s32i a9,a1,88 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB83_dspi_dotprod_off_s16_aes3 # [5]
// Vector path preconditions on the filter (any failure -> ANSI fallback):
//   filter->data has bit 0 clear,
//   filter->stride_x == filter->step_x * count_x (rows are contiguous),
//   filter->step_x == 1 and filter->step_y == 1.
// filter->data is spilled to [a1+84] for later use as the filter pointer.
l32i.n a8,a3,0 # [0] id:767
s32i a8,a1,84 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_36354 # [2]
bne a14,a11,.Lt_0_36354 # [0]
bnei a15,1,.Lt_0_36354 # [0]
l32i a9,a1,88 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_19458 # [2]
// Fallback 1: copy a2..a7 into the call8 argument registers a10..a15, place
// offset in the outgoing stack slot [a1+0] and call the C reference.
.Lt_0_36354: # 0x46
.Lt_0_19714: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:875
.type dspi_dotprod_off_s16_ansi, @function
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failure: return the literal status code.
.LBB83_dspi_dotprod_off_s16_aes3: # 0x5e
l32r a2,.LC0_1_61 # [0]
retw.n # [1]
// Vector path preconditions on the input image and the sizes. On entry a10 and
// a13 still hold in_image->step_x and in_image->step_y from the range checks.
// Any failure branches to the second ANSI fallback (.Lt_0_37122):
//   in_image->step_x == 1, in_image->step_y == 1,
//   count_x is a multiple of 8, count_y >= 4.
.Lt_0_19458: # 0x63
addi.n a9,a10,-1 # [0]
bnez a9,.Lt_0_37122 # [1]
addi.n a10,a13,-1 # [0]
bnez a10,.Lt_0_37122 # [1]
extui a11,a5,0,3 # [0]
bnez.n a11,.Lt_0_37122 # [1]
blti a6,4,.Lt_0_37122 # [0]
// For count_x > 32 an extra parity test is made at .LBB27. count_x is already
// known to be a multiple of 8 here, so that test always returns to .Lt_0_37634.
movi.n a14,32 # [0]
blt a14,a5,.LBB27_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_37634: # 0x7a
.Lt_0_21506: # 0x7a
// Set up pointers: a15 = filter->data (reloaded from the spill), a2 =
// in_image->data, a9 = offset. a12 and a13 still hold in_image->stride_x and
// in_image->step_y, so [a1+80] receives the input row pitch in bytes
// (stride_x * step_y * 2).
l32i a15,a1,84 # [0] gra_spill_temp_1
l32i.n a2,a2,0 # [1] id:769
l16si a9,a1,128 # [2] id:768 offset+0x0
mull a10,a12,a13 # [3]
addi a8,a1,16 # [4] temp_offset
slli a10,a10,1 # [5]
s32i a10,a1,80 # [6] gra_spill_temp_0
movi.n a10,2 # [7]
# loop-count fixed at 2
// Replicate offset into 16 consecutive halfwords at [a1+16 .. a1+47]
// (2 iterations of 8 stores) so it can be loaded as a vector operand.
loop a10,.LBB137_dspi_dotprod_off_s16_aes3 # [8]
.LBB132_dspi_dotprod_off_s16_aes3: # 0x93
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
addi a8,a8,16 # [0*II+8]
.LBB137_dspi_dotprod_off_s16_aes3: # 0xae
// a3 = row counter (count_y). SAR_BYTE and both halves of ACCX are cleared,
// then q6 is loaded with the replicated offset.
// NOTE(review): a12 = a1+24 is not 16-byte aligned; presumably the 128-bit
// load ignores the low address bits and reads the splat buffer - confirm
// against the ESP32-S3 TRM.
mov.n a3,a6 # [0]
addi a11,a5,-24 # [1]
addi a12,a1,24 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.sar_byte a13 # [5]
wur.accx_0 a13 # [6]
wur.accx_1 a13 # [7]
ee.vld.128.ip q6,a12,0 # [8] id:771
s32i.n a12,a1,48 # [9] offset_data_ptr
// count_x == 24 has a dedicated kernel; it jumps back to .Lt_0_25602 when done.
beqz a11,.LBB34_dspi_dotprod_off_s16_aes3 # [10]
// Width dispatch. Each dedicated kernel (count_x == 16, 8, 32, 64) jumps back
// to the label after its own test, so execution keeps falling through the
// remaining tests until it reaches the result stage.
// NOTE(review): for count_x <= 64 with no dedicated kernel (40, 48, 56) the
// bge below goes to the result stage with ACCX still zero - confirm whether
// callers can pass such widths.
.Lt_0_25602: # 0xc8
.Lt_0_25090: # 0xc8
// Preload the first filter vector into q0; a15 advances by 16 bytes.
ee.vld.128.ip q0,a15,16 # [0] id:786
addi a14,a5,-16 # [1]
beqz a14,.LBB40_dspi_dotprod_off_s16_aes3 # [2]
.Lt_0_27138: # 0xd1
.Lt_0_26626: # 0xd1
addi a8,a5,-8 # [0]
beqz a8,.LBB46_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_28674: # 0xd7
.Lt_0_28162: # 0xd7
addi a9,a5,-32 # [0]
beqz a9,.LBB52_dspi_dotprod_off_s16_aes3 # [1]
.Lt_0_30210: # 0xdd
.Lt_0_29698: # 0xdd
addi a10,a5,-64 # [0]
beqz a10,.LBB58_dspi_dotprod_off_s16_aes3 # [1]
movi.n a11,64 # [0]
bge a11,a5,.Lt_0_33026 # [1]
// General kernel for count_x > 64. a12 = row index. The three loads prime the
// input pipeline from a2 (in_image data): q1, q2 raw, q3 via the alignment
// step.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:848
ee.ld.128.usar.ip q2,a2,16 # [2] id:849
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:850
beqz.n a3,.Lt_0_33026 # [5]
// a13 = count_x / 32 - 1 (signed divide: +31 bias only when count_x < 0).
// a14 = row pitch in bytes - 2 * count_x + 16, used once per row to move the
// input pointer to the next row.
// NOTE(review): the quotient is truncated, so a width above 64 that is not a
// multiple of 32 is not fully covered by this loop - confirm.
slli a8,a5,1 # [0]
l32i a14,a1,80 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
// Per row: a13 passes of the inner loop, then one tail pass that also rewinds
// the input pointer to the next row. Instructions alternate between
// input * filter (loads from a2) and input * offset in q6 (loads from a15).
.Lt_0_33794: # 0x10c
beqz.n a13,.Lt_0_34050 # [0]
loopnez a13,.LBB273_dspi_dotprod_off_s16_aes3 # [0]
.LBB271_dspi_dotprod_off_s16_aes3: # 0x111
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:851
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+1] id:852
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q1,q2,q3,q0 # [0*II+3] id:853
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+4] id:854
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q4,q3,q0,q1 # [0*II+6] id:855
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:856
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+9] id:857
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:858
.LBB273_dspi_dotprod_off_s16_aes3: # 0x131
.Lt_0_34050: # 0x131
// Tail pass: same MAC sequence, but the a2 updates use a14 / -16 / +32 so the
// input pointer ends up at the matching position in the next row.
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0] id:859
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [1] id:860
movi.n a9,32 # [2]
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q1,q2,q3,q0 # [3] id:861
ee.vmulas.s16.accx.ld.ip q5,a15,16,q2,q6 # [4] id:862
movi.n a10,-16 # [5]
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q5,q3,q0,q7 # [6] id:863
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [7] id:865
ee.ld.128.usar.xp q1,a2,a9 # [8] id:864
addi.n a12,a12,1 # [9]
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [10] id:866
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [11] id:867
// Next row until a12 == a3 (count_y).
bne a12,a3,.Lt_0_33794 # [12]
// Result stage: read ACCX (a9 = low word, a10 = high word), round, shift and
// store 16 bits to *out_value. shift < 1 is handled separately at .Lt_0_35586.
.Lt_0_33026: # 0x15d
.Lt_0_32770: # 0x15d
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_35586 # [2]
// Return value 0 (success).
movi.n a2,0 # [0]
// Compute ((acc >> (shift - 1)) + 1) >> 1, i.e. a rounded right shift.
//   a12 = high word >> (shift - 1)            (arithmetic)
//   a11 = funnel shift of high:low by (shift - 1)
//   when shift - 33 >= 0 the high-word result a12 is selected instead.
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srai a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:873
retw.n # [10]
// Fallback 2: input image or size preconditions of the vector path failed.
// Forward all seven arguments (offset via the outgoing stack slot [a1+0]) to
// the C reference and return its status.
.Lt_0_37122: # 0x183
.Lt_0_20738: # 0x183
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:876
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Reached only when count_x > 32. An even count_x returns to the vector path;
// an odd count_x takes fallback 3 (same call sequence as above). count_x was
// already required to be a multiple of 8 before branching here.
.LBB27_dspi_dotprod_off_s16_aes3: # 0x19b
extui a9,a5,0,1 # [0]
beqz a9,.Lt_0_37634 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16si a8,a1,128 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:877
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Fixed-width kernels. In all of them a2 walks the input image (loads that
// take part in the alignment step), a15 walks the filter, q6 holds the
// replicated offset, and [a1+80] is the input row pitch in bytes. Register
// increments of a2 (a10 = +32, a11 = -16, a12 = row adjust) step to the next
// row.
//
// Kernel for count_x == 24: one hardware-loop pass per row (count_y passes),
// a12 = row pitch - 32. Returns to the dispatch chain afterwards.
.LBB34_dspi_dotprod_off_s16_aes3: # 0x1b9
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,80 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q0,a2,16 # [3] id:776
ee.ld.128.usar.ip q2,a2,16 # [4] id:777
addi a12,a12,-32 # [5]
ee.src.q.ld.ip q3,a2,16,q0,q2 # [6] id:778
loopgtz a6,.LBB159_dspi_dotprod_off_s16_aes3 # [7]
.LBB157_dspi_dotprod_off_s16_aes3: # 0x1cf
ee.vmulas.s16.accx.ld.ip q1,a15,16,q0,q6 # [0*II+0] id:779
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q1,q0,q2,q3 # [0*II+2] id:780
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+3] id:781
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q2,q3,q1 # [0*II+5] id:782
ee.vmulas.s16.accx.ld.ip q1,a15,16,q3,q6 # [0*II+6] id:784
ee.ld.128.usar.xp q0,a2,a10 # [0*II+7] id:783
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q1,q3,q0,q2 # [0*II+9] id:785
.LBB159_dspi_dotprod_off_s16_aes3: # 0x1ea
j .Lt_0_25602 # [0]
// Kernel for count_x == 16: the loop count is count_y >> 1 and a3 is
// overwritten with that value.
// NOTE(review): the loop body appears to cover two rows per pass, so an odd
// final row would be left out - confirm.
.LBB40_dspi_dotprod_off_s16_aes3: # 0x1ed
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:787
ee.ld.128.usar.ip q2,a2,16 # [5] id:788
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:789
loopnez a3,.LBB182_dspi_dotprod_off_s16_aes3 # [9]
.LBB180_dspi_dotprod_off_s16_aes3: # 0x206
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a11,q0,q1,q2,q3 # [0*II+0] id:790
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+1] id:791
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:792
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q2,q1,q0 # [0*II+4] id:793
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+5] id:794
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q1,q0,q3 # [0*II+7] id:795
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+8] id:796
ee.ld.128.usar.xp q1,a2,a10 # [0*II+9] id:797
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q0,q1,q2 # [0*II+11] id:798
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+12] id:799
.LBB182_dspi_dotprod_off_s16_aes3: # 0x22c
j .Lt_0_27138 # [0]
// Kernel for count_x == 8: the loop count is a3 >> 2 (a3 holds count_y on this
// path). a8 is a working copy of the input pointer starting at data + 16; the
// row step a11 is row pitch + 16 and pairs with the -16 in a10.
// NOTE(review): rows beyond a multiple of 4 appear to be left out - confirm.
.LBB46_dspi_dotprod_off_s16_aes3: # 0x22f
movi.n a10,-16 # [0]
l32i a11,a1,80 # [1] gra_spill_temp_0
addi a8,a2,16 # [2]
addi a11,a11,16 # [3]
ee.ld.128.usar.xp q2,a8,a10 # [4] id:800
ee.ld.128.usar.xp q1,a8,a11 # [5] id:801
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [7] id:802
ee.ld.128.usar.xp q2,a8,a11 # [8] id:803
srli a3,a3,2 # [9]
mov.n a2,a8 # [10]
loopnez a3,.LBB205_dspi_dotprod_off_s16_aes3 # [11]
.LBB203_dspi_dotprod_off_s16_aes3: # 0x24e
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+1] id:805
ee.ld.128.usar.xp q1,a2,a11 # [0*II+2] id:806
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q2,q1,q3 # [0*II+4] id:807
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+5] id:808
ee.ld.128.usar.xp q4,a2,a11 # [0*II+6] id:809
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q4,q3 # [0*II+8] id:810
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+9] id:811
ee.ld.128.usar.xp q1,a2,a11 # [0*II+10] id:812
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q4,q1,q3 # [0*II+12] id:813
ee.vmulas.s16.accx.ld.ip q0,a15,16,q4,q6 # [0*II+13] id:814
ee.ld.128.usar.xp q2,a2,a11 # [0*II+14] id:815
.LBB205_dspi_dotprod_off_s16_aes3: # 0x27a
j .Lt_0_28674 # [0]
// Kernel for count_x == 32: one pass per row (a3 = count_y passes),
// a12 = row pitch - 2 * count_x + 16.
.LBB52_dspi_dotprod_off_s16_aes3: # 0x27d
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:816
ee.ld.128.usar.ip q2,a2,16 # [5] id:817
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:818
addi a12,a12,16 # [9]
loopnez a3,.LBB228_dspi_dotprod_off_s16_aes3 # [10]
.LBB226_dspi_dotprod_off_s16_aes3: # 0x299
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:819
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:820
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q3,q0 # [0*II+3] id:821
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:822
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q0,q4 # [0*II+6] id:823
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:825
ee.ld.128.usar.xp q1,a2,a10 # [0*II+8] id:824
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+10] id:826
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+11] id:827
.LBB228_dspi_dotprod_off_s16_aes3: # 0x2bc
j .Lt_0_30210 # [0]
// Kernel for count_x == 64: one pass per row (a3 = count_y passes), input
// pointer copied to a8, a12 = row pitch - 2 * count_x + 16. Jumps straight to
// the result stage when done.
.LBB58_dspi_dotprod_off_s16_aes3: # 0x2bf
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,80 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:828
ee.ld.128.usar.ip q2,a2,16 # [5] id:829
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:830
mov.n a8,a2 # [10]
loopnez a3,.LBB250_dspi_dotprod_off_s16_aes3 # [11]
.LBB248_dspi_dotprod_off_s16_aes3: # 0x2dd
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:831
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:832
ee.vmulas.s16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:833
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:834
ee.vmulas.s16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:835
ee.vmulas.s16.accx.ld.ip q5,a15,16,q3,q6 # [0*II+7] id:836
ee.vmulas.s16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:837
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:838
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:839
ee.vmulas.s16.accx.ld.ip q4,a15,16,q4,q6 # [0*II+13] id:840
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:841
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+16] id:842
ee.vmulas.s16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:843
ee.vmulas.s16.accx.ld.ip q4,a15,16,q5,q6 # [0*II+19] id:845
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:844
ee.vmulas.s16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:846
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+23] id:847
.LBB250_dspi_dotprod_off_s16_aes3: # 0x320
j .Lt_0_33026 # [0]
// Result stage for shift < 1: no rounding and no shift. The low 16 bits of the
// accumulator low word (a9) are sign-extended and stored; returns 0.
.Lt_0_35586: # 0x323
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:874
retw.n # [3]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2D dot product with a constant filter offset, int16 data.
 *
 * Computes sum over y, x of in[y][x] * (filter[y][x] + offset), applies a
 * rounded arithmetic right shift by `shift`, and stores the low 16 bits in
 * *out_value.
 *
 * @param in_image   input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter     filter descriptor, same layout as in_image
 * @param out_value  receives the 16-bit result
 * @param count_x    number of samples per row
 * @param count_y    number of rows
 * @param shift      right shift applied to the accumulator; values below 1 mean
 *                   "no rounding, no shift" (same as the aes3 kernel)
 * @param offset     constant added to every filter sample before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either descriptor.
 */
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    // Distance between consecutive processed rows, in samples.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // filter + offset spans [-65536, 65534]; multiply in 64 bits because
            // (-32768) * (-65536) does not fit in int32_t.
            int32_t coeff = (int32_t)f_data[filter->step_x * x] + (int32_t)offset;
            acc += (int64_t)i_data[in_image->step_x * x] * coeff;
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        // Round to nearest. The constant is built in 64 bits so shift > 31 is
        // well defined; shift == 0 must not reach the `shift - 1` expression.
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,104 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_s16_arp4
.global dspi_dotprod_off_s16_ansi
.type dspi_dotprod_off_s16_arp4,@function
// esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
//
// Vector path for step_x == 1 on both images and count_x a multiple of 8;
// everything else tail-jumps to dspi_dotprod_off_s16_ansi with the argument
// registers untouched.
// NOTE(review): unlike the ANSI version, the vector path performs no
// step * count vs. stride range checks - confirm callers validate these.
dspi_dotprod_off_s16_arp4:
// Arguments:
//   a0 - in_image     a1 - filter      a2 - out_value
//   a3 - count_x      a4 - count_y     a5 - shift       a6 - offset
// Working registers:
//   t0 - start of current input row    t1 - start of current filter row
//   t2 - input row pitch in bytes      t3 - filter row pitch in bytes
//   t4 - running input pointer         t5 - running filter pointer
//   t6 - vectors per row (count_x / 8) a7 - shift - 1
//   q0 - input vector, q1 - filter vector, q2 - offset splat, q3 - filter + offset
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 when the OR of both step_x values is 1
andi t2, a3, 7 // count_x % 8
or t1, t1, t2
beqz t1, .dspi_dotprod_off_s16_arp4_body
j dspi_dotprod_off_s16_ansi // tail call: the ANSI version returns to our caller
.dspi_dotprod_off_s16_arp4_body:
// Spill offset to the stack so it can be broadcast into every lane of q2.
add sp, sp, -16
sw a6, 0(sp)
mv t6, sp
esp.vldbc.16.ip q2, t6, 0
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2
slli t2, t2, 1 // i_step in bytes (2 bytes per sample)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3
slli t3, t3, 1 // f_step in bytes (2 bytes per sample)
srli t6, a3, 3 // t6 = count_x / 8 = vectors per row
// Seed the accumulator with the rounding constant 1 << (shift - 1).
// NOTE(review): for shift == 0 the shift amount is -1; sll uses only the low
// bits of a7, so the seed is not zero in that case - confirm shift >= 1 is
// guaranteed by callers.
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
// Row loop. It is bottom-tested, so the body runs once even if count_y <= 0.
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 = first 8 filter samples of the row
// Hardware loop, t6 passes: load input, add offset to filter, multiply
// accumulate while fetching the next filter vector.
// NOTE(review): the last pass fetches one filter vector beyond the row
// (count_x / 8 + 1 loads per row) - the value is unused but the read happens.
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 = next 8 input samples
esp.vadd.s16 q3, q2, q1 // q3 = offset + filter
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 = next 8 filter samples
add t0, t0, t2
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store the 16-bit result to *out_value
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,408 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// 458755 == 0x70003: status returned when one of the range checks below fails.
// The C reference returns ESP_ERR_DSP_PARAM_OUTOFRANGE for the same checks.
.literal .LC0_1_57, 458755
# Program Unit: dspi_dotprod_off_s8_aes3
//
// dspi_dotprod_off_s8_aes3 - 2D dot product with filter offset, int8 data.
//
// Xtensa windowed ABI, frame of 112 bytes:
//   a2 = in_image   a3 = filter   a4 = out_value
//   a5 = count_x    a6 = count_y  a7 = shift
//   [a1+112] = offset (7th argument, read with l8ui)
// Structure fields read here (names follow the checks in the C reference):
//   +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// Returns in a2: 0 on the vector path, .LC0_1_57 on a failed range check,
// otherwise whatever the ANSI fallback returns.
.type dspi_dotprod_off_s8_aes3, @function
.align 4
.global dspi_dotprod_off_s8_aes3
dspi_dotprod_off_s8_aes3: # 0x4
.LBB1_dspi_dotprod_off_s8_aes3: # 0x4
entry a1,112 #
// Range check 1: in_image->step_x * count_x must not exceed in_image->stride_x.
l32i.n a10,a2,4 # [0] id:745
l32i.n a12,a2,12 # [1] id:744
mull a8,a10,a5 # [2]
blt a12,a8,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
// Range check 2: in_image->step_y * count_y must not exceed in_image->stride_y.
l32i.n a13,a2,8 # [0] id:746
l32i.n a9,a2,16 # [1] id:747
mull a11,a13,a6 # [2]
blt a9,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
// Range check 3: filter->step_x * count_x must not exceed filter->stride_x.
l32i.n a15,a3,4 # [0] id:749
l32i.n a14,a3,12 # [1] id:748
mull a11,a15,a5 # [2]
blt a14,a11,.LBB86_dspi_dotprod_off_s8_aes3 # [4]
// Range check 4: filter->step_y * count_y must not exceed filter->stride_y.
// filter->step_y is spilled to [a1+72] because a9 is reused for the product.
l32i.n a8,a3,16 # [0] id:751
l32i.n a9,a3,8 # [1] id:750
s32i a9,a1,72 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB86_dspi_dotprod_off_s8_aes3 # [5]
// Vector path preconditions on the filter (any failure -> ANSI fallback):
//   filter->data has bit 0 clear,
//   filter->stride_x == filter->step_x * count_x (rows are contiguous),
//   filter->step_x == 1 and filter->step_y == 1.
// filter->data is spilled to [a1+68] for later use as the filter pointer.
l32i.n a8,a3,0 # [0] id:752
s32i a8,a1,68 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_35330 # [2]
bne a14,a11,.Lt_0_35330 # [0]
bnei a15,1,.Lt_0_35330 # [0]
l32i a11,a1,72 # [0] gra_spill_temp_2
beqi a11,1,.Lt_0_18946 # [2]
// Fallback 1: forward ALL seven arguments to the offset-aware C reference.
// The previous code called dspi_dotprod_s8_ansi, which has no offset
// parameter, so results differed between the vector and fallback paths.
// The 7th argument travels in the outgoing stack slot [a1+0]; that slot is
// only used as scratch by the vector path, which is not entered from here.
.Lt_0_35330: # 0x46
.Lt_0_19202: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (incoming 7th argument)
sext a8,a8,7 # sign-extend the int8 value
s32i.n a8,a1,0 # outgoing 7th argument
.type dspi_dotprod_off_s8_ansi, @function
call8 dspi_dotprod_off_s8_ansi # dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failure: return the literal status code.
.LBB86_dspi_dotprod_off_s8_aes3: # 0x59
l32r a2,.LC0_1_57 # [0]
retw.n # [1]
// Vector path preconditions on the input image and the sizes. On entry a10 and
// a13 still hold in_image->step_x and in_image->step_y from the range checks.
// Any failure branches to the second fallback (.Lt_0_36098):
//   in_image->step_x == 1, in_image->step_y == 1,
//   count_x is a multiple of 16, count_y >= 4.
.Lt_0_18946: # 0x5e
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_36098 # [1]
addi.n a15,a13,-1 # [0]
bnez a15,.Lt_0_36098 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_36098 # [1]
blti a6,4,.Lt_0_36098 # [0]
// For count_x > 64 an extra parity test is made at .LBB27. count_x is already
// known to be a multiple of 16 here, so that test always returns to
// .Lt_0_36610.
movi.n a9,64 # [0]
blt a9,a5,.LBB27_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_36610: # 0x75
.Lt_0_20994: # 0x75
// Set up pointers: a15 = in_image->data, a2 = filter->data (from the spill),
// a9 = sign-extended offset byte. a12 and a13 still hold in_image->stride_x
// and in_image->step_y, so [a1+64] receives the input row pitch in bytes.
mov.n a8,a1 # [0]
l8ui a9,a1,112 # [1] id:754 offset+0x0
l32i.n a15,a2,0 # [2] id:753
mull a10,a12,a13 # [3]
l32i a2,a1,68 # [4] gra_spill_temp_1
s32i a10,a1,64 # [5] gra_spill_temp_0
sext a9,a9,7 # [6]
movi.n a10,4 # [7]
# loop-count fixed at 4
// Replicate offset into 32 consecutive bytes at [a1+0 .. a1+31]
// (4 iterations of 8 stores) so it can be loaded as a vector operand.
loop a10,.LBB140_dspi_dotprod_off_s8_aes3 # [8]
.LBB135_dspi_dotprod_off_s8_aes3: # 0x8d
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
addi.n a8,a8,8 # [0*II+8]
.LBB140_dspi_dotprod_off_s8_aes3: # 0xa7
// a3 = row counter (count_y). Both halves of ACCX are cleared, then q6 is
// loaded with the replicated offset.
// NOTE(review): a12 = a1+8 is not 16-byte aligned; presumably the 128-bit
// load ignores the low address bits and reads the splat buffer - confirm
// against the ESP32-S3 TRM.
mov.n a3,a6 # [0]
addi a11,a5,-48 # [1]
addi.n a12,a1,8 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.accx_0 a13 # [5]
wur.accx_1 a13 # [6]
ee.vld.128.ip q6,a12,0 # [7] id:756
s32i.n a12,a1,32 # [8] offset_data_ptr
// count_x == 48 has a dedicated kernel; it jumps back to .Lt_0_24578 when done.
beqz a11,.LBB34_dspi_dotprod_off_s8_aes3 # [9]
// Otherwise preload the first filter vector into q0 (a2 advances by 16) and
// keep a copy at [a1+48]; the kernels reload q0 from that slot.
l32i a2,a1,68 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:771
st.qr q0,a1,48 # [3] q0
// Width dispatch. Each dedicated kernel (count_x == 32, 16, 64) jumps back to
// the label after its own test, so execution keeps falling through the
// remaining tests until it reaches the result stage. The 128 kernel finishes
// on its own.
// NOTE(review): for count_x <= 128 with no dedicated kernel (80, 96, 112) the
// bge below goes to the result stage with ACCX still zero - confirm whether
// callers can pass such widths.
.Lt_0_24578: # 0xc6
addi a14,a5,-32 # [0]
beqz a14,.LBB43_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_26626: # 0xcc
.Lt_0_26114: # 0xcc
addi a8,a5,-16 # [0]
beqz a8,.LBB50_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_28162: # 0xd2
.Lt_0_27650: # 0xd2
addi a9,a5,-64 # [0]
beqz a9,.LBB57_dspi_dotprod_off_s8_aes3 # [1]
.Lt_0_29698: # 0xd8
.Lt_0_29186: # 0xd8
addi a10,a5,-128 # [0]
beqz a10,.LBB64_dspi_dotprod_off_s8_aes3 # [1]
movi a11,128 # [0]
bge a11,a5,.Lt_0_32514 # [1]
// General kernel for count_x > 128. a12 = row index. The three loads prime the
// input pipeline from a15 (in_image data).
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
beqz.n a3,.Lt_0_32514 # [5]
// a13 = count_x / 64 - 1 inner passes per row, plus one tail pass.
// Every pass consumes four 16-byte vectors (64 int8 samples) per operand, and
// the tail rewinds the input pointer by a14 = row pitch - count_x + 16, which
// only lands on the next row when passes * 64 == count_x. The divisor used to
// be 32 (the value that is correct for the 16-bit kernel, where a vector
// holds 8 samples), which made each row read 2 * count_x bytes.
// NOTE(review): the quotient is truncated, so a width above 128 that is not a
// multiple of 64 is still not fully covered - confirm.
ld.qr q0,a1,48 # [0] q0
l32i a14,a1,64 # [1] gra_spill_temp_0
addi a13,a5,63 # bias for signed division by 64
movgez a13,a5,a5 # no bias when count_x >= 0
srai a13,a13,6 # a13 = count_x / 64
sub a14,a14,a5 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
// Per row: a13 passes of the inner loop, then one tail pass that also rewinds
// the input pointer to the next row. Instructions alternate between
// input * filter (loads from a15) and input * offset in q6 (loads from a2).
.Lt_0_33282: # 0x108
beqz.n a13,.Lt_0_33538 # [0]
loopnez a13,.LBB277_dspi_dotprod_off_s8_aes3 # [0]
.LBB275_dspi_dotprod_off_s8_aes3: # 0x10d
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
ee.vmulas.s8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
ee.vmulas.s8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
.LBB277_dspi_dotprod_off_s8_aes3: # 0x12d
.Lt_0_33538: # 0x12d
// Tail pass: same MAC sequence, but the a15 updates use a14 / -16 / +32 so the
// input pointer ends up at the matching position in the next row.
ee.vmulas.s8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
movi.n a8,32 # [2]
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
ee.vmulas.s8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
movi.n a9,-16 # [5]
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
addi.n a12,a12,1 # [9]
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
// Next row until a12 == a3 (count_y).
bne a12,a3,.Lt_0_33282 # [12]
// Result stage: (low word of ACCX + (1 << (shift - 1))) >> shift, arithmetic,
// low 8 bits stored to *out_value. Only the low accumulator word is read.
// Returns 0 in a2.
.Lt_0_32514: # 0x159
.Lt_0_32258: # 0x159
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
// a11 = 1 << (shift - 1): the rounding constant.
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
// Fallback 2: input image or size preconditions of the vector path failed.
// Forward ALL seven arguments to the offset-aware C reference. The previous
// code called dspi_dotprod_s8_ansi, which has no offset parameter, so offset
// was silently ignored on this path. The 7th argument travels in the outgoing
// stack slot [a1+0], which has not been used as scratch yet at this point.
.type dspi_dotprod_off_s8_ansi, @function
.Lt_0_36098: # 0x175
.Lt_0_20226: # 0x175
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (incoming 7th argument)
sext a8,a8,7 # sign-extend the int8 value
s32i.n a8,a1,0 # outgoing 7th argument
call8 dspi_dotprod_off_s8_ansi # dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Reached only when count_x > 64. An even count_x returns to the vector path;
// an odd count_x takes fallback 3 (same call sequence as above). count_x was
// already required to be a multiple of 16 before branching here.
.LBB27_dspi_dotprod_off_s8_aes3: # 0x188
extui a14,a5,0,1 # [0]
beqz a14,.Lt_0_36610 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (incoming 7th argument)
sext a8,a8,7 # sign-extend the int8 value
s32i.n a8,a1,0 # outgoing 7th argument
call8 dspi_dotprod_off_s8_ansi # dspi_dotprod_off_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Fixed-width kernels. In all of them a15 walks the input image (loads that
// take part in the alignment step), a2 walks the filter, q6 holds the
// replicated offset, [a1+64] is the input row pitch in bytes and [a1+48] is
// the spill slot for q0 between kernels.
//
// Kernel for count_x == 48: one hardware-loop pass per row (count_y passes),
// a12 = row pitch - 32. Saves q1 to the q0 spill slot and returns to the
// dispatch chain.
.LBB34_dspi_dotprod_off_s8_aes3: # 0x1a1
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
beqz.n a6,.Lt_0_24578 # [4]
movi.n a10,32 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB163_dspi_dotprod_off_s8_aes3 # [4]
.LBB161_dspi_dotprod_off_s8_aes3: # 0x1b9
ee.vmulas.s8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
ee.vmulas.s8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
ee.vmulas.s8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
.LBB163_dspi_dotprod_off_s8_aes3: # 0x1d4
st.qr q1,a1,48 # [0] q0
j .Lt_0_24578 # [1]
// Kernel for count_x == 32: the loop count is count_y >> 1 and a3 is
// overwritten with that value.
// NOTE(review): the loop body appears to cover two rows per pass, so an odd
// final row would be left out - confirm.
.LBB43_dspi_dotprod_off_s8_aes3: # 0x1da
srli a3,a6,1 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
beqz.n a3,.Lt_0_26626 # [7]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB186_dspi_dotprod_off_s8_aes3 # [3]
.LBB184_dspi_dotprod_off_s8_aes3: # 0x1f8
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
.LBB186_dspi_dotprod_off_s8_aes3: # 0x21e
st.qr q0,a1,48 # [0] q0
j .Lt_0_26626 # [1]
// Kernel for count_x == 16: the loop count is a3 >> 2 (a3 holds count_y on this
// path). The input pointer starts at data + 16; the row step a11 is row pitch
// + 16 and pairs with the -16 in a13 / a10.
// NOTE(review): rows beyond a multiple of 4 appear to be left out - confirm.
.LBB50_dspi_dotprod_off_s8_aes3: # 0x224
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,64 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
beqz.n a3,.Lt_0_28162 # [10]
ld.qr q0,a1,48 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB209_dspi_dotprod_off_s8_aes3 # [2]
.LBB207_dspi_dotprod_off_s8_aes3: # 0x248
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
.LBB209_dspi_dotprod_off_s8_aes3: # 0x274
st.qr q0,a1,48 # [0] q0
j .Lt_0_28162 # [1]
// Kernel for count_x == 64: one pass per row (a3 = count_y passes),
// a12 = row pitch - count_x + 16.
.LBB57_dspi_dotprod_off_s8_aes3: # 0x27a
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
beqz.n a3,.Lt_0_29698 # [4]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
movi.n a11,-16 # [3]
sub a12,a12,a5 # [4]
addi a12,a12,16 # [5]
loopnez a3,.LBB232_dspi_dotprod_off_s8_aes3 # [6]
.LBB230_dspi_dotprod_off_s8_aes3: # 0x298
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
ee.vmulas.s8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
.LBB232_dspi_dotprod_off_s8_aes3: # 0x2bb
st.qr q0,a1,48 # [0] q0
j .Lt_0_29698 # [1]
// Kernel for count_x == 128: one pass per row (a3 = count_y passes), input
// pointer copied to a8, a12 = row pitch - count_x + 16. Falls into its own copy
// of the result stage below instead of jumping back.
.LBB64_dspi_dotprod_off_s8_aes3: # 0x2c1
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ld.qr q0,a1,48 # [8] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
mov.n a8,a15 # [10]
loopnez a3,.LBB254_dspi_dotprod_off_s8_aes3 # [11]
.LBB252_dspi_dotprod_off_s8_aes3: # 0x2df
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
ee.vmulas.s8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
ee.vmulas.s8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
ee.vmulas.s8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
ee.vmulas.s8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
ee.vmulas.s8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
ee.vmulas.s8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
ee.vmulas.s8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
.LBB254_dspi_dotprod_off_s8_aes3: # 0x322
// Result stage (copy of the one at .Lt_0_32514):
// (low word of ACCX + (1 << (shift - 1))) >> shift, low 8 bits stored to
// *out_value, return 0.
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2D dot product with a constant filter offset, int8 data.
 *
 * Computes sum over y, x of in[y][x] * (filter[y][x] + offset) in a 32-bit
 * accumulator, applies a rounded arithmetic right shift by `shift`, and stores
 * the low 8 bits in *out_value.
 *
 * @param in_image   input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter     filter descriptor, same layout as in_image
 * @param out_value  receives the 8-bit result
 * @param count_x    number of samples per row
 * @param count_y    number of rows
 * @param shift      right shift applied to the accumulator; values below 1 mean
 *                   "no rounding, no shift"
 * @param offset     constant added to every filter sample before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either descriptor.
 */
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    // Distance between consecutive processed rows, in samples.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // filter + offset spans [-256, 254]; the product fits easily in int.
            int coeff = (int16_t)f_data[filter->step_x * x] + (int16_t)offset;
            acc += (int16_t)i_data[in_image->step_x * x] * coeff;
        }
        i_data += i_step;
        f_data += f_step;
    }

    if (shift > 0) {
        // Round to nearest; shift == 0 must not reach the `shift - 1`
        // expression because a negative shift count is undefined behaviour.
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int8_t)acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,102 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_s8_arp4
.global dspi_dotprod_off_s8_ansi
.type dspi_dotprod_off_s8_arp4,@function
// esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
//
// Vector path for step_x == 1 on both images and count_x a multiple of 16;
// everything else tail-jumps to dspi_dotprod_off_s8_ansi with the argument
// registers untouched (so out_value has the same int8_t* type on both paths).
// NOTE(review): unlike the ANSI version, the vector path performs no
// step * count vs. stride range checks - confirm callers validate these.
dspi_dotprod_off_s8_arp4:
// Arguments:
//   a0 - in_image     a1 - filter      a2 - out_value
//   a3 - count_x      a4 - count_y     a5 - shift       a6 - offset
// Working registers:
//   t0 - start of current input row    t1 - start of current filter row
//   t2 - input row pitch in bytes      t3 - filter row pitch in bytes
//   t4 - running input pointer         t5 - running filter pointer
//   t6 - vectors per row (count_x / 16) a7 - shift - 1
//   q0 - input vector, q1 - filter vector, q2 - offset splat, q3 - filter + offset
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 when the OR of both step_x values is 1
andi t2, a3, 15 // count_x % 16
or t1, t1, t2
beqz t1, .dspi_dotprod_off_s8_arp4_body
j dspi_dotprod_off_s8_ansi // tail call: the ANSI version returns to our caller
.dspi_dotprod_off_s8_arp4_body:
// Spill offset to the stack so it can be broadcast into every lane of q2.
add sp, sp, -16
sw a6, 0(sp)
mv t6, sp
esp.vldbc.8.ip q2, t6, 0
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step in bytes (1 byte per sample)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step in bytes (1 byte per sample)
srli t6, a3, 4 // t6 = count_x / 16 = vectors per row
// Seed the accumulator with the rounding constant 1 << (shift - 1).
// NOTE(review): for shift == 0 the shift amount is -1; sll uses only the low
// bits of a7, so the seed is not zero in that case - confirm shift >= 1 is
// guaranteed by callers.
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
// Row loop. It is bottom-tested, so the body runs once even if count_y <= 0.
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 = first 16 filter samples of the row
// Hardware loop, t6 passes: load input, add offset to filter, multiply
// accumulate while fetching the next filter vector.
// NOTE(review): the last pass fetches one filter vector beyond the row
// (count_x / 16 + 1 loads per row) - the value is unused but the read happens.
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 = next 16 input samples
esp.vadd.s8 q3, q2, q1 // q3 = offset + filter
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3; q1 = next 16 filter samples
add t0, t0, t2
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
// out_value is int8_t*: store exactly one byte. The previous halfword store
// (sh) also overwrote the byte following *out_value.
sb t5, 0(a2)
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,417 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Return value loaded on a failed range check (458755 = 0x70003).
// Presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm against dsp_err_codes.h.
.literal .LC0_1_61, 458755
# Program Unit: dspi_dotprod_off_u16_aes3
//
// dspi_dotprod_off_u16_aes3: vectorized 2D dot product with offset for uint16 data.
// Compiler-generated, software-pipelined code; do not reorder instructions.
//
// Register-window calling convention (entry / retw). The arguments are forwarded
// unchanged to dspi_dotprod_off_u16_ansi on the fallback paths, so:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y,
//   a7 = shift, and the 7th argument (offset) is read from the stack at a1+144.
// Returns in a2: 0 on the vector path, .LC0_1_61 on a failed range check,
// otherwise the return value of the ANSI fallback.
//
// Stack frame slots used below (144 bytes):
//   a1+0   outgoing stack argument (offset) for the ANSI fallback
//   a1+16  temp_offset: 16 halfwords filled with offset
//   a1+48  offset_data_ptr
//   a1+64  spilled q0
//   a1+96  gra_spill_temp_0: row stride in bytes (stride_x * step_y * 2)
//   a1+100 gra_spill_temp_1: filter data pointer
//   a1+104 gra_spill_temp_2: filter word at offset 8
.type dspi_dotprod_off_u16_aes3, @function
.align 4
.global dspi_dotprod_off_u16_aes3
dspi_dotprod_off_u16_aes3: # 0x4
.LBB1_dspi_dotprod_off_u16_aes3: # 0x4
entry a1,144 #
// Range checks, same comparisons as the ANSI version: fail when
// (word at +4) * count_x > (word at +12) or (word at +8) * count_y > (word at +16),
// first for in_image (a2) and then for filter (a3).
l32i.n a10,a2,4 # [0] id:760
l32i.n a12,a2,12 # [1] id:759
mull a8,a10,a5 # [2]
blt a12,a8,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a13,a2,8 # [0] id:761
l32i.n a9,a2,16 # [1] id:762
mull a11,a13,a6 # [2]
blt a9,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a15,a3,4 # [0] id:764
l32i.n a14,a3,12 # [1] id:763
mull a11,a15,a5 # [2]
blt a14,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
l32i.n a8,a3,16 # [0] id:766
l32i.n a9,a3,8 # [1] id:765
s32i a9,a1,104 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB89_dspi_dotprod_off_u16_aes3 # [5]
// Filter preconditions for the vector path: data pointer (word at +0) has bit 0
// clear, word at +12 equals (word at +4) * count_x, word at +4 is 1 and word at +8 is 1.
// Any failure falls into the ANSI call below.
l32i.n a8,a3,0 # [0] id:767
s32i a8,a1,100 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_36354 # [2]
bne a14,a11,.Lt_0_36354 # [0]
bnei a15,1,.Lt_0_36354 # [0]
l32i a9,a1,104 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_19458 # [2]
// ANSI fallback: a10..a15 become the callee a2..a7 after call8; offset is
// copied from the incoming stack slot to the outgoing one at a1+0.
.Lt_0_36354: # 0x46
.Lt_0_19714: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:876
.type dspi_dotprod_off_u16_ansi, @function
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error literal.
.LBB89_dspi_dotprod_off_u16_aes3: # 0x5e
l32r a2,.LC0_1_61 # [0]
retw.n # [1]
// Input image preconditions: word at +4 is 1 (a10), word at +8 is 1 (a13),
// count_x is a multiple of 8 and count_y >= 4; otherwise use the ANSI version.
.Lt_0_19458: # 0x63
addi.n a9,a10,-1 # [0]
bnez a9,.Lt_0_37122 # [1]
addi.n a10,a13,-1 # [0]
bnez a10,.Lt_0_37122 # [1]
extui a11,a5,0,3 # [0]
bnez.n a11,.Lt_0_37122 # [1]
blti a6,4,.Lt_0_37122 # [0]
movi.n a14,32 # [0]
blt a14,a5,.LBB27_dspi_dotprod_off_u16_aes3 # [1]
// Vector path setup: a9 = offset, a15 = in_image data pointer, a2 = filter data
// pointer, and the byte row stride (a12 * a13 * 2) is saved at a1+96.
.Lt_0_37634: # 0x7a
.Lt_0_21506: # 0x7a
l16ui a9,a1,144 # [0] id:768 offset+0x0
addi a8,a1,16 # [1] temp_offset
l32i.n a15,a2,0 # [2] id:769
mull a10,a12,a13 # [3]
l32i a2,a1,100 # [4] gra_spill_temp_1
slli a10,a10,1 # [5]
s32i a10,a1,96 # [6] gra_spill_temp_0
movi.n a10,2 # [7]
# loop-count fixed at 2
// Fill temp_offset with 2 x 8 halfword copies of offset.
loop a10,.LBB143_dspi_dotprod_off_u16_aes3 # [8]
.LBB138_dspi_dotprod_off_u16_aes3: # 0x93
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
addi a8,a8,16 # [0*II+8]
// a3 = count_y (row counter for the loops below); clear sar_byte and the
// accumulator; q6 = vector of offset values loaded from temp_offset.
.LBB143_dspi_dotprod_off_u16_aes3: # 0xae
mov.n a3,a6 # [0]
addi a11,a5,-24 # [1]
addi a12,a1,24 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.sar_byte a13 # [5]
wur.accx_0 a13 # [6]
wur.accx_1 a13 # [7]
ee.vld.128.ip q6,a12,0 # [8] id:771
s32i.n a12,a1,48 # [9] offset_data_ptr
// Dispatch on count_x to a specialized row loop:
//   24 -> .LBB34, 16 -> .LBB43, 8 -> .LBB50, 32 -> .LBB57, 64 -> .LBB64,
//   more than 64 -> generic loop below, anything else goes straight to the result.
beqz a11,.LBB34_dspi_dotprod_off_u16_aes3 # [10]
l32i a2,a1,100 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:787
st.qr q0,a1,64 # [3] q0
.Lt_0_25090: # 0xd1
addi a14,a5,-16 # [0]
beqz a14,.LBB43_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_27138: # 0xd7
.Lt_0_26626: # 0xd7
addi a8,a5,-8 # [0]
beqz a8,.LBB50_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_28674: # 0xdd
.Lt_0_28162: # 0xdd
addi a9,a5,-32 # [0]
beqz a9,.LBB57_dspi_dotprod_off_u16_aes3 # [1]
.Lt_0_30210: # 0xe3
.Lt_0_29698: # 0xe3
addi a10,a5,-64 # [0]
beqz a10,.LBB64_dspi_dotprod_off_u16_aes3 # [1]
movi.n a11,64 # [0]
bge a11,a5,.Lt_0_33026 # [1]
// Generic path (count_x > 64): a12 counts rows up to a3, a13 = count_x / 32 - 1
// inner iterations, a14 = row stride - 2 * count_x + 16 is the pointer step
// applied to a15 at the end of each row.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:849
ee.ld.128.usar.ip q2,a15,16 # [2] id:850
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:851
beqz.n a3,.Lt_0_33026 # [5]
ld.qr q0,a1,64 # [0] q0
slli a8,a5,1 # [1]
l32i a14,a1,96 # [2] gra_spill_temp_0
addi a13,a5,31 # [3]
movgez a13,a5,a5 # [4]
srai a13,a13,5 # [5]
sub a14,a14,a8 # [6]
addi a14,a14,16 # [7]
addi.n a13,a13,-1 # [8]
.Lt_0_33794: # 0x115
beqz.n a13,.Lt_0_34050 # [0]
loopnez a13,.LBB280_dspi_dotprod_off_u16_aes3 # [0]
.LBB278_dspi_dotprod_off_u16_aes3: # 0x11a
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:852
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:853
ee.vmulas.u16.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:854
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:855
ee.vmulas.u16.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:856
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:857
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:858
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:859
.LBB280_dspi_dotprod_off_u16_aes3: # 0x13a
// Row epilogue of the generic path: last vectors of the row, then a15 is
// moved to the next row (step a14) and the alignment state is reloaded.
.Lt_0_34050: # 0x13a
ee.vmulas.u16.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:860
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [1] id:861
movi.n a9,32 # [2]
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:862
ee.vmulas.u16.accx.ld.ip q7,a2,16,q2,q6 # [4] id:863
movi.n a10,-16 # [5]
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a10,q7,q3,q4,q0 # [6] id:864
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [7] id:866
ee.ld.128.usar.xp q1,a15,a9 # [8] id:865
addi.n a12,a12,1 # [9]
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:867
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [11] id:868
bne a12,a3,.Lt_0_33794 # [12]
// Result: a10:a9 = accumulator words accx_1:accx_0.
// shift < 1  -> .Lt_0_35586 stores the low 16 bits without shifting.
// shift >= 1 -> shift right by (shift - 1) (high word only when shift - 33 >= 0),
//               add 1 and shift right once more (round to nearest),
//               store 16 bits to out_value and return 0.
.Lt_0_33026: # 0x166
.Lt_0_32770: # 0x166
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_35586 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srli a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:874
retw.n # [10]
// ANSI fallback for unsupported input image layout or size.
.Lt_0_37122: # 0x18c
.Lt_0_20738: # 0x18c
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:877
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x > 32: continue on the vector path when count_x is even, otherwise
// ANSI fallback. count_x was already checked to be a multiple of 8 above.
.LBB27_dspi_dotprod_off_u16_aes3: # 0x1a4
extui a9,a5,0,1 # [0]
beqz a9,.Lt_0_37634 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l16ui a8,a1,144 # [6] id:768 offset+0x0
s32i.n a8,a1,0 # [7] id:878
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 24: one hardware-loop iteration per row (count_y iterations).
.LBB34_dspi_dotprod_off_u16_aes3: # 0x1c2
ee.ld.128.usar.ip q0,a15,16 # [0] id:776
ee.ld.128.usar.ip q2,a15,16 # [1] id:777
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:778
beqz.n a6,.Lt_0_25090 # [4]
movi.n a10,32 # [0]
l32i a12,a1,96 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB166_dspi_dotprod_off_u16_aes3 # [4]
.LBB164_dspi_dotprod_off_u16_aes3: # 0x1da
ee.vmulas.u16.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:779
ee.vmulas.u16.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:780
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:781
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:782
ee.vmulas.u16.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:784
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:783
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:785
.LBB166_dspi_dotprod_off_u16_aes3: # 0x1f5
st.qr q1,a1,64 # [0] q0
j .Lt_0_25090 # [1]
// count_x == 16: two rows per hardware-loop iteration (count_y / 2 iterations).
.LBB43_dspi_dotprod_off_u16_aes3: # 0x1fb
srli a3,a6,1 # [0]
l32i a12,a1,96 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:788
ee.ld.128.usar.ip q2,a15,16 # [3] id:789
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:790
beqz.n a3,.Lt_0_27138 # [7]
ld.qr q0,a1,64 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB189_dspi_dotprod_off_u16_aes3 # [3]
.LBB187_dspi_dotprod_off_u16_aes3: # 0x219
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:791
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:792
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:793
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:794
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:795
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:796
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:797
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:798
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:799
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:800
.LBB189_dspi_dotprod_off_u16_aes3: # 0x23f
st.qr q0,a1,64 # [0] q0
j .Lt_0_27138 # [1]
// count_x == 8: four rows per hardware-loop iteration (a3 / 4 iterations).
.LBB50_dspi_dotprod_off_u16_aes3: # 0x245
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,96 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:801
ee.ld.128.usar.xp q1,a15,a11 # [6] id:802
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:803
ee.ld.128.usar.xp q2,a15,a11 # [9] id:804
beqz.n a3,.Lt_0_28674 # [10]
ld.qr q0,a1,64 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB212_dspi_dotprod_off_u16_aes3 # [2]
.LBB210_dspi_dotprod_off_u16_aes3: # 0x269
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:805
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:806
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:807
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:808
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:809
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:810
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:811
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:812
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:813
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:814
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:815
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:816
.LBB212_dspi_dotprod_off_u16_aes3: # 0x295
st.qr q0,a1,64 # [0] q0
j .Lt_0_28674 # [1]
// count_x == 32: one row per hardware-loop iteration (a3 iterations).
.LBB57_dspi_dotprod_off_u16_aes3: # 0x29b
ee.ld.128.usar.ip q1,a15,16 # [0] id:817
ee.ld.128.usar.ip q2,a15,16 # [1] id:818
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:819
beqz.n a3,.Lt_0_30210 # [4]
ld.qr q0,a1,64 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
l32i a12,a1,96 # [3] gra_spill_temp_0
slli a13,a5,1 # [4]
sub a12,a12,a13 # [5]
addi a12,a12,16 # [6]
loopnez a3,.LBB235_dspi_dotprod_off_u16_aes3 # [7]
.LBB233_dspi_dotprod_off_u16_aes3: # 0x2bc
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:820
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:821
ee.vmulas.u16.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:822
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:823
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:824
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:826
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:825
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:827
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:828
.LBB235_dspi_dotprod_off_u16_aes3: # 0x2df
st.qr q0,a1,64 # [0] q0
j .Lt_0_30210 # [1]
// count_x == 64: one row per hardware-loop iteration (a3 iterations), using a8
// as the running input pointer; jumps to the result code when done.
.LBB64_dspi_dotprod_off_u16_aes3: # 0x2e5
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i a12,a1,96 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [4] id:829
ee.ld.128.usar.ip q2,a15,16 # [5] id:830
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ld.qr q0,a1,64 # [9] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [10] id:831
mov.n a8,a15 # [11]
loopnez a3,.LBB257_dspi_dotprod_off_u16_aes3 # [12]
.LBB255_dspi_dotprod_off_u16_aes3: # 0x306
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:832
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:833
ee.vmulas.u16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:834
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:835
ee.vmulas.u16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:836
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:837
ee.vmulas.u16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:838
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:839
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:840
ee.vmulas.u16.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:841
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:842
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:843
ee.vmulas.u16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:844
ee.vmulas.u16.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:846
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:845
ee.vmulas.u16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:847
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:848
.LBB257_dspi_dotprod_off_u16_aes3: # 0x349
j .Lt_0_33026 # [0]
// shift < 1: store the low 16 bits of the accumulator unshifted, return 0.
.Lt_0_35586: # 0x34c
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:875
retw.n # [3]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (plain C) 2D dot product of a uint16 image window with a uint16
 * filter, where a constant offset is added to every filter element:
 *
 *     acc = sum over y, x of  in[y][x] * (filter[y][x] + offset)
 *
 * The accumulated value is rounded to nearest and shifted right by `shift`
 * before being truncated to uint16 and written to *out_value.
 *
 * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param filter     filter descriptor with the same layout
 * @param out_value  receives the single uint16 result
 * @param count_x    number of elements to process per row
 * @param count_y    number of rows to process
 * @param shift      right shift applied to the accumulator; values <= 0 leave it unshifted
 * @param offset     constant added to every filter element
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
 *         exceeds the corresponding stride of either descriptor.
 */
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    // Distance, in elements, between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // The product can reach 65535 * (65535 + 65535), which does not fit
            // in int32_t, so the multiplication is carried out in 64 bits.
            int64_t sample = (int64_t)i_data[in_image->step_x * x];
            int64_t weight = (int64_t)f_data[filter->step_x * x] + (int64_t)offset;
            acc += sample * weight;
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Shifting by a negative amount is undefined behaviour, so only round and
    // shift for shift > 0. The rounding term is built in 64 bits to match acc.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = (uint16_t)acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,104 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_u16_arp4
.global dspi_dotprod_off_u16_ansi
.type dspi_dotprod_off_u16_arp4,@function
// esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
//
// Vectorized 2D dot product with offset for uint16 data:
//     acc = sum of in[y][x] * (filter[y][x] + offset), rounded and shifted right by `shift`.
// The vector path is only taken when both step_x values are 1 and count_x is a
// multiple of 8; every other case tail-jumps to dspi_dotprod_off_u16_ansi with
// the argument registers untouched.
// NOTE(review): the vector path does not repeat the step * count <= stride range
// checks done by the ANSI version, and always runs at least one row - confirm
// callers validate their arguments.
// NOTE(review): for shift == 0 the rounding term becomes 1 << 31 because sll only
// uses the low 5 bits of (shift - 1) - confirm shift >= 1 is guaranteed.
dspi_dotprod_off_u16_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6
// i_data - t0 (start of current input row)
// f_data - t1 (start of current filter row)
// i_step - t2 (bytes between input rows)
// f_step - t3 (bytes between filter rows)
// t4 - current i_data (also scratch for the rounding term)
// t5 - current f_data
// t6 - count_x / 8, the hardware loop count
// a7 - scratch (shift - 1)
// q0 - input vector, q1 - filter vector, q2 - offset vector, q3 - filter + offset
// Fast-path test: ((in.step_x | filter.step_x) - 1) | (count_x & 7) must be 0.
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 when both step_x are 1
andi t2, a3, 7 // count_x must be a multiple of 8
or t1, t1, t2
beqz t1, .dspi_dotprod_off_u16_arp4_body
j dspi_dotprod_off_u16_ansi // tail call, a0..a6 still hold the arguments
.dspi_dotprod_off_u16_arp4_body:
// Spill offset to the stack so it can be loaded into a vector register.
add sp, sp, -16
sw a6, 0(sp)
mv t6, sp
esp.vldbc.16.ip q2, t6, 0 // q2 = offset halfword (presumably broadcast to all 16-bit lanes)
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2
slli t2, t2, 1 // i_step = i_step<<1 (elements to bytes)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3
slli t3, t3, 1 // f_step = f_step<<1 (elements to bytes)
srli t6, a3, 3 // t6 = len/8 vectors per row
// Seed the accumulator with the rounding term 1 << (shift - 1).
addi a7, a5, -1
li t4, 1
sll t4, t4, a7
esp.zero.xacc
esp.movx.w.xacc.l t4
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 - first 8 filter values of this row
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 - next 8 input values
esp.vadd.u16 q3, q2, q1 // q3 = filter + offset
.loop_count_x: esp.vmulas.u16.xacc.ld.ip q1, t5, 16, q0, q3 // accumulate q0 * q3, q1 = next filter vector
add t0, t0, t2 // advance both row pointers
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store result to output buffer
li a0,0 // return 0 on success
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,407 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
// Return value loaded on a failed range check (458755 = 0x70003).
// Presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm against dsp_err_codes.h.
.literal .LC0_1_57, 458755
# Program Unit: dspi_dotprod_off_u8_aes3
//
// dspi_dotprod_off_u8_aes3: vectorized 2D dot product with offset for uint8 data.
// Compiler-generated, software-pipelined code; do not reorder instructions.
//
// Register-window calling convention (entry / retw):
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y,
//   a7 = shift, and the 7th argument (offset) is read from the stack at a1+112.
// Returns in a2: 0 on the vector path, .LC0_1_57 on a failed range check,
// otherwise the return value of the ANSI fallback.
//
// Fallback fix: the three fallback paths used to call dspi_dotprod_u8_ansi, the
// variant without offset, so offset was silently dropped whenever the vector
// path could not be used. They now call dspi_dotprod_off_u8_ansi and pass
// offset in the outgoing stack slot a1+0. That slot is part of temp_offset,
// which is only written on the vector path after all fallback branches.
//
// Stack frame slots used below (112 bytes):
//   a1+0   temp_offset: 32 bytes filled with offset (vector path),
//          or the outgoing stack argument for the ANSI fallback
//   a1+32  offset_data_ptr
//   a1+48  spilled q0
//   a1+64  gra_spill_temp_0: row stride in bytes (stride_x * step_y)
//   a1+68  gra_spill_temp_1: filter data pointer
//   a1+72  gra_spill_temp_2: filter word at offset 8
.type dspi_dotprod_off_u8_aes3, @function
.align 4
.global dspi_dotprod_off_u8_aes3
dspi_dotprod_off_u8_aes3: # 0x4
.LBB1_dspi_dotprod_off_u8_aes3: # 0x4
entry a1,112 #
// Range checks, same comparisons as the ANSI version: fail when
// (word at +4) * count_x > (word at +12) or (word at +8) * count_y > (word at +16),
// first for in_image (a2) and then for filter (a3).
l32i.n a10,a2,4 # [0] id:745
l32i.n a12,a2,12 # [1] id:744
mull a8,a10,a5 # [2]
blt a12,a8,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a13,a2,8 # [0] id:746
l32i.n a9,a2,16 # [1] id:747
mull a11,a13,a6 # [2]
blt a9,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a15,a3,4 # [0] id:749
l32i.n a14,a3,12 # [1] id:748
mull a11,a15,a5 # [2]
blt a14,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]
l32i.n a8,a3,16 # [0] id:751
l32i.n a9,a3,8 # [1] id:750
s32i a9,a1,72 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB86_dspi_dotprod_off_u8_aes3 # [5]
// Filter preconditions for the vector path: data pointer (word at +0) has bit 0
// clear, word at +12 equals (word at +4) * count_x, word at +4 is 1 and word at +8 is 1.
// Any failure falls into the ANSI call below.
l32i.n a8,a3,0 # [0] id:752
s32i a8,a1,68 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_35330 # [2]
bne a14,a11,.Lt_0_35330 # [0]
bnei a15,1,.Lt_0_35330 # [0]
l32i a11,a1,72 # [0] gra_spill_temp_2
beqi a11,1,.Lt_0_18946 # [2]
// ANSI fallback: a10..a15 become the callee a2..a7 after call8; offset is
// copied from the incoming stack slot to the outgoing one at a1+0.
.Lt_0_35330: # 0x46
.Lt_0_19202: # 0x46
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (7th argument) from the incoming stack slot
s32i.n a8,a1,0 # pass offset as the first stack argument of the callee
.type dspi_dotprod_off_u8_ansi, @function
call8 dspi_dotprod_off_u8_ansi # ANSI fallback that applies offset
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error literal.
.LBB86_dspi_dotprod_off_u8_aes3: # 0x59
l32r a2,.LC0_1_57 # [0]
retw.n # [1]
// Input image preconditions: word at +4 is 1 (a10), word at +8 is 1 (a13),
// count_x is a multiple of 16 and count_y >= 4; otherwise use the ANSI version.
.Lt_0_18946: # 0x5e
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_36098 # [1]
addi.n a15,a13,-1 # [0]
bnez a15,.Lt_0_36098 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_36098 # [1]
blti a6,4,.Lt_0_36098 # [0]
movi.n a9,64 # [0]
blt a9,a5,.LBB27_dspi_dotprod_off_u8_aes3 # [1]
// Vector path setup: a9 = offset, a15 = in_image data pointer, a2 = filter data
// pointer, and the byte row stride (a12 * a13) is saved at a1+64.
.Lt_0_36610: # 0x75
.Lt_0_20994: # 0x75
l8ui a9,a1,112 # [0] id:754 offset+0x0
mov.n a8,a1 # [1]
l32i.n a15,a2,0 # [2] id:753
mull a10,a12,a13 # [3]
l32i a2,a1,68 # [4] gra_spill_temp_1
s32i a10,a1,64 # [5] gra_spill_temp_0
movi.n a10,4 # [6]
# loop-count fixed at 4
// Fill temp_offset with 4 x 8 byte copies of offset.
loop a10,.LBB140_dspi_dotprod_off_u8_aes3 # [7]
.LBB135_dspi_dotprod_off_u8_aes3: # 0x8a
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
addi.n a8,a8,8 # [0*II+8]
// a3 = count_y (row counter for the loops below); clear the accumulator;
// q6 = vector of offset values loaded from temp_offset.
.LBB140_dspi_dotprod_off_u8_aes3: # 0xa4
mov.n a3,a6 # [0]
addi a11,a5,-48 # [1]
addi.n a12,a1,8 # [3] temp_offset+8
movi.n a13,0 # [4]
wur.accx_0 a13 # [5]
wur.accx_1 a13 # [6]
ee.vld.128.ip q6,a12,0 # [7] id:756
s32i.n a12,a1,32 # [8] offset_data_ptr
// Dispatch on count_x to a specialized row loop:
//   48 -> .LBB34, 32 -> .LBB43, 16 -> .LBB50, 64 -> .LBB57, 128 -> .LBB64,
//   more than 128 -> generic loop below, anything else goes straight to the result.
beqz a11,.LBB34_dspi_dotprod_off_u8_aes3 # [9]
l32i a2,a1,68 # [0] gra_spill_temp_1
ee.vld.128.ip q0,a2,16 # [2] id:771
st.qr q0,a1,48 # [3] q0
.Lt_0_24578: # 0xc3
addi a14,a5,-32 # [0]
beqz a14,.LBB43_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_26626: # 0xc9
.Lt_0_26114: # 0xc9
addi a8,a5,-16 # [0]
beqz a8,.LBB50_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_28162: # 0xcf
.Lt_0_27650: # 0xcf
addi a9,a5,-64 # [0]
beqz a9,.LBB57_dspi_dotprod_off_u8_aes3 # [1]
.Lt_0_29698: # 0xd5
.Lt_0_29186: # 0xd5
addi a10,a5,-128 # [0]
beqz a10,.LBB64_dspi_dotprod_off_u8_aes3 # [1]
movi a11,128 # [0]
bge a11,a5,.Lt_0_32514 # [1]
// Generic path (count_x > 128): a12 counts rows up to a3, a13 = count_x / 32 - 1
// inner iterations, a14 = row stride - count_x + 16 is the pointer step applied
// to a15 at the end of each row.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
beqz.n a3,.Lt_0_32514 # [5]
ld.qr q0,a1,48 # [0] q0
l32i a14,a1,64 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a5 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_33282: # 0x105
beqz.n a13,.Lt_0_33538 # [0]
loopnez a13,.LBB277_dspi_dotprod_off_u8_aes3 # [0]
.LBB275_dspi_dotprod_off_u8_aes3: # 0x10a
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
ee.vmulas.u8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
ee.vmulas.u8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
.LBB277_dspi_dotprod_off_u8_aes3: # 0x12a
// Row epilogue of the generic path: last vectors of the row, then a15 is
// moved to the next row (step a14) and the alignment state is reloaded.
.Lt_0_33538: # 0x12a
ee.vmulas.u8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
movi.n a8,32 # [2]
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
ee.vmulas.u8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
movi.n a9,-16 # [5]
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
addi.n a12,a12,1 # [9]
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
bne a12,a3,.Lt_0_33282 # [12]
// Result: a10 = low accumulator word; add the rounding term 1 << (shift - 1),
// arithmetic shift right by shift, store one byte to out_value, return 0.
.Lt_0_32514: # 0x156
.Lt_0_32258: # 0x156
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
// ANSI fallback for unsupported input image layout or size.
.Lt_0_36098: # 0x172
.Lt_0_20226: # 0x172
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (7th argument) from the incoming stack slot
s32i.n a8,a1,0 # pass offset as the first stack argument of the callee
call8 dspi_dotprod_off_u8_ansi # ANSI fallback that applies offset
mov.n a2,a10 # [0]
retw.n # [1]
// count_x > 64: continue on the vector path when count_x is even, otherwise
// ANSI fallback. count_x was already checked to be a multiple of 16 above.
.LBB27_dspi_dotprod_off_u8_aes3: # 0x185
extui a14,a5,0,1 # [0]
beqz a14,.Lt_0_36610 # [1]
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
l8ui a8,a1,112 # offset (7th argument) from the incoming stack slot
s32i.n a8,a1,0 # pass offset as the first stack argument of the callee
call8 dspi_dotprod_off_u8_ansi # ANSI fallback that applies offset
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 48: one hardware-loop iteration per row (count_y iterations).
.LBB34_dspi_dotprod_off_u8_aes3: # 0x19e
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
beqz.n a6,.Lt_0_24578 # [4]
movi.n a10,32 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB163_dspi_dotprod_off_u8_aes3 # [4]
.LBB161_dspi_dotprod_off_u8_aes3: # 0x1b6
ee.vmulas.u8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
ee.vmulas.u8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
ee.vmulas.u8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
.LBB163_dspi_dotprod_off_u8_aes3: # 0x1d1
st.qr q1,a1,48 # [0] q0
j .Lt_0_24578 # [1]
// count_x == 32: two rows per hardware-loop iteration (count_y / 2 iterations).
.LBB43_dspi_dotprod_off_u8_aes3: # 0x1d7
srli a3,a6,1 # [0]
l32i a12,a1,64 # [1] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
addi a12,a12,-16 # [5]
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
beqz.n a3,.Lt_0_26626 # [7]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopnez a3,.LBB186_dspi_dotprod_off_u8_aes3 # [3]
.LBB184_dspi_dotprod_off_u8_aes3: # 0x1f5
ee.vmulas.u8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
.LBB186_dspi_dotprod_off_u8_aes3: # 0x21b
st.qr q0,a1,48 # [0] q0
j .Lt_0_26626 # [1]
// count_x == 16: four rows per hardware-loop iteration (a3 / 4 iterations).
.LBB50_dspi_dotprod_off_u8_aes3: # 0x221
srli a3,a3,2 # [0]
movi.n a13,-16 # [1]
l32i a11,a1,64 # [2] gra_spill_temp_0
addi a15,a15,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
beqz.n a3,.Lt_0_28162 # [10]
ld.qr q0,a1,48 # [0] q0
movi.n a10,-16 # [1]
loopnez a3,.LBB209_dspi_dotprod_off_u8_aes3 # [2]
.LBB207_dspi_dotprod_off_u8_aes3: # 0x245
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
.LBB209_dspi_dotprod_off_u8_aes3: # 0x271
st.qr q0,a1,48 # [0] q0
j .Lt_0_28162 # [1]
// count_x == 64: one row per hardware-loop iteration (a3 iterations).
.LBB57_dspi_dotprod_off_u8_aes3: # 0x277
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
beqz.n a3,.Lt_0_29698 # [4]
ld.qr q0,a1,48 # [0] q0
movi.n a10,32 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
movi.n a11,-16 # [3]
sub a12,a12,a5 # [4]
addi a12,a12,16 # [5]
loopnez a3,.LBB232_dspi_dotprod_off_u8_aes3 # [6]
.LBB230_dspi_dotprod_off_u8_aes3: # 0x295
ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
ee.vmulas.u8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
.LBB232_dspi_dotprod_off_u8_aes3: # 0x2b8
st.qr q0,a1,48 # [0] q0
j .Lt_0_29698 # [1]
// count_x == 128: one row per hardware-loop iteration (a3 iterations), using a8
// as the running input pointer; the result code is duplicated after the loop.
.LBB64_dspi_dotprod_off_u8_aes3: # 0x2be
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i a12,a1,64 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ld.qr q0,a1,48 # [8] q0
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
mov.n a8,a15 # [10]
loopnez a3,.LBB254_dspi_dotprod_off_u8_aes3 # [11]
.LBB252_dspi_dotprod_off_u8_aes3: # 0x2dc
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
ee.vmulas.u8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
ee.vmulas.u8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
ee.vmulas.u8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
ee.vmulas.u8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
ee.vmulas.u8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
ee.vmulas.u8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
ee.vmulas.u8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
// Same rounding, shift and byte store as at .Lt_0_32514.
.LBB254_dspi_dotprod_off_u8_aes3: # 0x31f
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:854
retw.n # [10]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (plain C) 2D dot product of unsigned 8-bit data with a constant
 * offset added to every filter value:
 *
 *   acc = sum over y in [0, count_y), x in [0, count_x) of
 *         in_image[y][x] * (filter[y][x] + offset)
 *
 * The 32-bit accumulator is rounded to nearest, shifted right by `shift`
 * and then stored (truncated to 8 bits) to *out_value.
 *
 * @param in_image   input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter     filter descriptor with the same fields
 * @param out_value  receives the result
 * @param count_x    number of elements processed per row
 * @param count_y    number of rows processed
 * @param shift      right shift applied to the accumulator; values <= 0 mean
 *                   "no rounding and no shift"
 * @param offset     constant added to every filter value before multiplying
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
 *         step * count exceeds the matching stride of the image or the filter.
 */
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset)
{
    // The requested window must fit into both descriptors.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    // Distance (in elements) between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. Skipped for shift <= 0, where the original
    // "1 << (shift - 1)" (and a negative shift count) would be undefined.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,102 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_off_u8_arp4
.global dspi_dotprod_off_u8_ansi
.type dspi_dotprod_off_u8_arp4,@function
// esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
//
// Vector version of dspi_dotprod_off_u8_ansi. It is used only when
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 16;
// every other case tail-jumps to dspi_dotprod_off_u8_ansi with the arguments untouched.
// NOTE(review): unlike the ANSI version, the vector path does not perform the
// step * count <= stride range checks - confirm callers guarantee them.
dspi_dotprod_off_u8_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6
// i_data - t0 (start of the current input row)
// f_data - t1 (start of the current filter row)
// i_step - t2 (stride_x * step_y of in_image)
// f_step - t3 (stride_x * step_y of filter)
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 16 (hardware loop count)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (step_x | step_x) == 1
andi t2, a3, 15 // count_x % 16
or t1, t1, t2
beqz t1, .dspi_dotprod_off_u8_arp4_body
j dspi_dotprod_off_u8_ansi // unsupported layout: tail-call the C reference
.dspi_dotprod_off_u8_arp4_body:
add sp, sp, -16
sw a6, 0(sp) // spill offset so it can be loaded into a vector register
mv t6, sp
esp.vldbc.8.ip q2, t6, 0 // q2 <- offset byte (broadcast load, per the instruction name)
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step in bytes (1 byte per element)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step in bytes
srli t6, a3, 4 // t6 = count_x / 16
addi a7, a5, -1
li t4, 1
sll t4, t4, a7 // t4 = 1 << (shift - 1): rounding constant
esp.zero.xacc
esp.movx.w.xacc.l t4 // preload the accumulator with the rounding constant
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q1, t5, 16 // q1 - f_data (first 16 bytes of the row)
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q0, t4, 16 // q0 - i_data
esp.vadd.u8 q3, q2, q1 // q3 = f_data + offset; NOTE(review): confirm whether this add saturates, the C version does not
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3, q1 - next f_data
add t0, t0, t2
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sb t5, 0(a2) // out_value is uint8_t*: store exactly one byte (a halfword store would overrun it)
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,372 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_53, 458755
# Program Unit: dspi_dotprod_s16_aes3
.type dspi_dotprod_s16_aes3, @function
.align 4
.global dspi_dotprod_s16_aes3
// Entry of dspi_dotprod_s16_aes3: argument validation and slow-path selection.
// Incoming registers are forwarded unchanged (as a10..a15) to dspi_dotprod_s16_ansi,
// so they presumably carry the same six arguments:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
// Descriptor words read here: +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// (field names follow the equivalent checks of the ANSI C implementation).
dspi_dotprod_s16_aes3: # 0x4
.LBB1_dspi_dotprod_s16_aes3: # 0x4
entry a1,64 #
// Check 1: in_image word[+4] * count_x must not exceed in_image word[+12].
l32i.n a10,a2,4 # [0] id:678
l32i.n a11,a2,12 # [1] id:677
mull a8,a10,a5 # [2]
blt a11,a8,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Check 2: in_image word[+8] * count_y must not exceed in_image word[+16].
l32i.n a12,a2,8 # [0] id:679
l32i.n a9,a2,16 # [1] id:680
mull a13,a12,a6 # [2]
blt a9,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Check 3: filter word[+4] * count_x must not exceed filter word[+12].
l32i.n a15,a3,4 # [0] id:682
l32i.n a14,a3,12 # [1] id:681
mull a13,a15,a5 # [2]
blt a14,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
// Check 4: filter word[+8] * count_y must not exceed filter word[+16].
// filter word[+8] is also spilled to the stack frame for the test below.
l32i.n a8,a3,16 # [0] id:684
l32i.n a9,a3,8 # [1] id:683
s32i.n a9,a1,24 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB81_dspi_dotprod_s16_aes3 # [5]
// Filter data pointer (word[+0]) is spilled for the vector path.
// The vector path is taken only when: bit 0 of the filter data pointer is clear,
// filter word[+12] == filter word[+4] * count_x, filter word[+4] == 1 and
// filter word[+8] == 1. Anything else falls through to the ANSI call.
l32i.n a8,a3,0 # [0] id:685
s32i.n a8,a1,20 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_34050 # [2]
bne a14,a13,.Lt_0_34050 # [0]
bnei a15,1,.Lt_0_34050 # [0]
l32i.n a9,a1,24 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_18178 # [2]
// Slow path: call the C reference with the original arguments and return its result.
.Lt_0_34050: # 0x43
.Lt_0_18434: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_s16_ansi, @function
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the constant held in literal .LC0_1_53
// (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, matching the ANSI version - confirm).
.LBB81_dspi_dotprod_s16_aes3: # 0x56
l32r a2,.LC0_1_53 # [0]
retw.n # [1]
// Vector-path gate of dspi_dotprod_s16_aes3. On arrival a10/a12/a11 still hold
// in_image word[+4]/word[+8]/word[+12] loaded at function entry, a5 = count_x,
// a6 = count_y, a7 = shift.
// Falls back to the ANSI call (.Lt_0_34818) unless word[+4] == 1, word[+8] == 1,
// count_x is a multiple of 8 and count_y >= 4.
.Lt_0_18178: # 0x5b
addi.n a13,a10,-1 # [0]
bnez a13,.Lt_0_34818 # [1]
addi.n a14,a12,-1 # [0]
bnez a14,.Lt_0_34818 # [1]
extui a15,a5,0,3 # [0]
bnez.n a15,.Lt_0_34818 # [1]
blti a6,4,.Lt_0_34818 # [0]
// For count_x > 32 an odd count_x is routed to the ANSI call as well
// (count_x was already required to be a multiple of 8 just above).
movi.n a8,32 # [0]
bge a8,a5,.Lt_0_35330 # [1]
extui a9,a5,0,1 # [0]
bnez a9,.LBB28_dspi_dotprod_s16_aes3 # [1]
// Common setup: a3 = count_y, a2 = in_image data pointer, a15 = filter data
// pointer (from the spill slot), SAR_BYTE and ACCX cleared, first filter
// vector preloaded into q0. The spill slot at a1+16 receives
// (in_image word[+12] * word[+8]) << 1, i.e. the input row pitch in bytes.
.Lt_0_35330: # 0x78
.Lt_0_20226: # 0x78
mov.n a3,a6 # [0]
addi a10,a5,-24 # [1]
mull a13,a11,a12 # [2]
l32i.n a15,a1,20 # [3] gra_spill_temp_1
l32i.n a2,a2,0 # [4] id:686
movi.n a14,0 # [5]
wur.sar_byte a14 # [6]
wur.accx_0 a14 # [8]
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:690
slli a13,a13,1 # [11]
s32i.n a13,a1,16 # [12] gra_spill_temp_0
// Dispatch on count_x: 24, 16, 8, 32 and 64 have dedicated loops; each of
// those loops jumps back into this chain after it has finished.
beqz a10,.LBB32_dspi_dotprod_s16_aes3 # [13]
.Lt_0_23298: # 0x99
.Lt_0_22786: # 0x99
addi a8,a5,-16 # [0]
beqz a8,.LBB38_dspi_dotprod_s16_aes3 # [1]
.Lt_0_24834: # 0x9f
.Lt_0_24322: # 0x9f
addi a9,a5,-8 # [0]
beqz a9,.LBB44_dspi_dotprod_s16_aes3 # [1]
.Lt_0_26370: # 0xa5
.Lt_0_25858: # 0xa5
addi a10,a5,-32 # [0]
beqz a10,.LBB50_dspi_dotprod_s16_aes3 # [1]
.Lt_0_27906: # 0xab
.Lt_0_27394: # 0xab
addi a11,a5,-64 # [0]
beqz a11,.LBB56_dspi_dotprod_s16_aes3 # [1]
// count_x <= 64 at this point means a dedicated loop already ran (or nothing
// matched): go straight to the final rounding.
movi.n a12,64 # [0]
bge a12,a5,.Lt_0_30722 # [1]
// Generic loop for count_x > 64. a12 = row counter; q1..q3 are primed with
// unaligned loads from the input row.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:762
ee.ld.128.usar.ip q2,a2,16 # [2] id:763
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:764
beqz.n a3,.Lt_0_30722 # [5]
// a13 = count_x / 32 - 1 (signed division): trip count of the inner hardware loop.
// a14 = row pitch - 2 * count_x + 16: input pointer adjustment used once per row.
slli a8,a5,1 # [0]
l32i.n a14,a1,16 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
// One pass of this outer loop per row, until a12 == a3 (count_y).
.Lt_0_31490: # 0xd9
addi.n a12,a12,1 # [0]
movi.n a9,32 # [1]
beqz.n a13,.Lt_0_31746 # [2]
loopnez a13,.LBB221_dspi_dotprod_s16_aes3 # [0]
// Inner loop body: four multiply-accumulate steps, each paired with a
// 128-bit filter load (a15) and an input load (a2).
.LBB219_dspi_dotprod_s16_aes3: # 0xe2
ee.vld.128.ip q5,a15,16 # [0*II+0] id:766
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:765
ee.vld.128.ip q0,a15,16 # [0*II+2] id:768
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:767
ee.vld.128.ip q5,a15,16 # [0*II+4] id:770
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:769
ee.vld.128.ip q0,a15,16 # [0*II+6] id:772
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:771
// Row tail: four more multiply-accumulate steps; the .xp loads apply a14 and
// a10 (-16) / a9 (32) to move the input pointer on to the next row.
.LBB221_dspi_dotprod_s16_aes3: # 0xfe
.Lt_0_31746: # 0xfe
ee.vmulas.s16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:773
movi.n a10,-16 # [1]
ee.vld.128.ip q0,a15,16 # [2] id:774
ee.vld.128.ip q6,a15,16 # [3] id:776
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:775
ee.vld.128.ip q4,a15,16 # [5] id:779
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:777
ee.ld.128.usar.xp q1,a2,a9 # [7] id:778
ee.vld.128.ip q0,a15,16 # [8] id:781
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:780
bne a12,a3,.Lt_0_31490 # [10]
// Finalisation: a9 = ACCX low word, a10 = ACCX high part.
// shift < 1 is handled at .Lt_0_33282 (store without scaling).
// Otherwise: value = ACCX >> (shift - 1), taken from the high part alone when
// shift - 33 >= 0; then (value + 1) >> 1 is stored as 16 bits and 0 is returned.
.Lt_0_30722: # 0x122
.Lt_0_30466: # 0x122
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_33282 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srai a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:787
retw.n # [10]
// ANSI fallback used by the vector-path gate of dspi_dotprod_s16_aes3:
// forwards a2..a7 unchanged and returns the callee's result.
.Lt_0_34818: # 0x148
.Lt_0_19458: # 0x148
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Dedicated loop for count_x == 24 (three 128-bit vectors per row).
// a13 still holds the input row pitch in bytes; a12 = pitch - 32.
// Runs count_y (a6) iterations, then rejoins the dispatch chain.
.LBB32_dspi_dotprod_s16_aes3: # 0x15b
ee.ld.128.usar.ip q1,a2,16 # [0] id:691
ee.ld.128.usar.ip q2,a2,16 # [1] id:692
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:693
beqz.n a6,.Lt_0_23298 # [4]
addi a12,a13,-32 # [0]
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopgtz a6,.LBB107_dspi_dotprod_s16_aes3 # [3]
.LBB105_dspi_dotprod_s16_aes3: # 0x170
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:694
ee.vld.128.ip q5,a15,16 # [0*II+2] id:697
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:696
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:698
ee.vld.128.ip q0,a15,16 # [0*II+5] id:700
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:699
.LBB107_dspi_dotprod_s16_aes3: # 0x188
j .Lt_0_23298 # [0]
// Dedicated loop for count_x == 16. The loop count is count_y >> 1 and each
// iteration issues four multiply-accumulate steps.
// NOTE(review): with an odd count_y the last row appears to be left out - confirm.
.LBB38_dspi_dotprod_s16_aes3: # 0x18b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:701
ee.ld.128.usar.ip q2,a2,16 # [5] id:702
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:703
loopnez a3,.LBB130_dspi_dotprod_s16_aes3 # [9]
.LBB128_dspi_dotprod_s16_aes3: # 0x1a3
ee.vld.128.ip q4,a15,16 # [0*II+0] id:705
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:704
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:706
ee.vld.128.ip q0,a15,16 # [0*II+3] id:708
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:707
ee.vld.128.ip q5,a15,16 # [0*II+5] id:710
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:709
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:711
ee.vld.128.ip q0,a15,16 # [0*II+8] id:713
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:712
.LBB130_dspi_dotprod_s16_aes3: # 0x1c5
j .Lt_0_24834 # [0]
// Dedicated loop for count_x == 8. The loop count is a3 >> 2 (a3 = count_y)
// and each iteration issues four multiply-accumulate steps. The input is
// walked through a8 (= a2 + 16), which is copied back to a2 afterwards.
// NOTE(review): rows beyond a multiple of 4 appear to be left out - confirm.
.LBB44_dspi_dotprod_s16_aes3: # 0x1c8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,16 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:714
ee.ld.128.usar.xp q1,a8,a11 # [6] id:715
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:716
ee.ld.128.usar.xp q2,a8,a11 # [9] id:717
loopnez a3,.LBB153_dspi_dotprod_s16_aes3 # [10]
.LBB151_dspi_dotprod_s16_aes3: # 0x1e4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:719
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:718
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:720
ee.vld.128.ip q0,a15,16 # [0*II+3] id:722
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:721
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:723
ee.vld.128.ip q5,a15,16 # [0*II+6] id:725
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:724
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:726
ee.vld.128.ip q0,a15,16 # [0*II+9] id:728
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:727
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:729
.LBB153_dspi_dotprod_s16_aes3: # 0x20c
mov.n a2,a8 # [0]
j .Lt_0_26370 # [1]
// Dedicated loop for count_x == 32: a3 (count_y) iterations of four
// multiply-accumulate steps. a12 = row pitch - 2 * count_x + 16.
.LBB50_dspi_dotprod_s16_aes3: # 0x211
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:730
ee.ld.128.usar.ip q2,a2,16 # [5] id:731
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:732
addi a12,a12,16 # [9]
loopnez a3,.LBB176_dspi_dotprod_s16_aes3 # [10]
.LBB174_dspi_dotprod_s16_aes3: # 0x22c
ee.vld.128.ip q5,a15,16 # [0*II+0] id:734
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:733
ee.vld.128.ip q1,a15,16 # [0*II+2] id:736
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:735
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:737
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:738
ee.vld.128.ip q0,a15,16 # [0*II+7] id:741
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:740
.LBB176_dspi_dotprod_s16_aes3: # 0x24b
j .Lt_0_27906 # [0]
// Dedicated loop for count_x == 64: a3 (count_y) iterations of eight
// multiply-accumulate steps, then straight to the final rounding.
.LBB56_dspi_dotprod_s16_aes3: # 0x24e
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:742
ee.ld.128.usar.ip q2,a2,16 # [5] id:743
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:744
loopnez a3,.LBB198_dspi_dotprod_s16_aes3 # [10]
.LBB196_dspi_dotprod_s16_aes3: # 0x269
ee.vld.128.ip q4,a15,16 # [0*II+0] id:746
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:745
ee.vld.128.ip q0,a15,16 # [0*II+2] id:748
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:747
ee.vld.128.ip q5,a15,16 # [0*II+4] id:750
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:749
ee.vld.128.ip q6,a15,16 # [0*II+6] id:752
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:751
ee.vld.128.ip q5,a15,16 # [0*II+8] id:754
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:753
ee.vld.128.ip q6,a15,16 # [0*II+10] id:756
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:755
ee.vld.128.ip q5,a15,16 # [0*II+12] id:759
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:757
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:758
ee.vld.128.ip q0,a15,16 # [0*II+15] id:761
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:760
.LBB198_dspi_dotprod_s16_aes3: # 0x2a4
j .Lt_0_30722 # [0]
// shift < 1: store the low 16 bits of the accumulator low word (a9)
// without rounding or shifting, and return 0.
.Lt_0_33282: # 0x2a7
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:788
retw.n # [3]
// ANSI fallback taken for count_x > 32 with bit 0 of count_x set.
.LBB28_dspi_dotprod_s16_aes3: # 0x2b1
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (plain C) 2D dot product of signed 16-bit data:
 *
 *   acc = sum over y in [0, count_y), x in [0, count_x) of
 *         in_image[y][x] * filter[y][x]
 *
 * The 64-bit accumulator is rounded to nearest, shifted right by `shift`
 * and then stored (truncated to 16 bits) to *out_value.
 *
 * @param in_image   input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter     filter descriptor with the same fields
 * @param out_value  receives the result
 * @param count_x    number of elements processed per row
 * @param count_y    number of rows processed
 * @param shift      right shift applied to the accumulator; values <= 0 mean
 *                   "no rounding and no shift"
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
 *         step * count exceeds the matching stride of the image or the filter.
 */
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift)
{
    // The requested window must fit into both descriptors.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    // Distance (in elements) between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. The rounding constant is built as int64_t so
    // that shift values of 32 and above do not overflow a 32-bit int, and the
    // whole step is skipped for shift <= 0, where "1 << (shift - 1)" would be
    // undefined.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,95 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_s16_arp4
.global dspi_dotprod_s16_ansi
.type dspi_dotprod_s16_arp4,@function
// esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
//
// Vector version of dspi_dotprod_s16_ansi. It is used only when
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 8;
// every other case tail-jumps to dspi_dotprod_s16_ansi with the arguments untouched.
// NOTE(review): unlike the ANSI version, the vector path does not perform the
// step * count <= stride range checks - confirm callers guarantee them.
dspi_dotprod_s16_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// i_data - t0 (start of the current input row)
// f_data - t1 (start of the current filter row)
// i_step - t2 (row pitch of in_image, in bytes)
// f_step - t3 (row pitch of filter, in bytes)
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 8 (hardware loop count)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (step_x | step_x) == 1
andi t2, a3, 7 // count_x % 8
or t1, t1, t2
beqz t1, .dspi_dotprod_s16_arp4_body
j dspi_dotprod_s16_ansi // unsupported layout: tail-call the C reference
.dspi_dotprod_s16_arp4_body:
add sp, sp, -16 // NOTE(review): this 16-byte frame is never read or written below
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2
slli t2, t2, 1 // i_step = i_step<<1 (elements -> bytes)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3
slli t3, t3, 1 // f_step = f_step<<1 (elements -> bytes)
srli t6, a3, 3 // t6 = count_x / 8
addi a6, a5, -1
li t4, 1
sll t4, t4, a6 // t4 = 1 << (shift - 1): rounding constant
esp.zero.xacc
esp.movx.w.xacc.l t4 // preload the accumulator with the rounding constant
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q0, t4, 16 // q0 - i_data (first 16 bytes of the row)
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q1, t5, 16 // q1 - f_data
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q0, t4, 16, q0, q1 // xacc += q0 * q1, q0 - next i_data
add t0, t0, t2
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sh t5, 0(a2) // store result (16 bits) to output buffer
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,370 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_52, 458755
# Program Unit: dspi_dotprod_s8_aes3
.type dspi_dotprod_s8_aes3, @function
.align 4
.global dspi_dotprod_s8_aes3
// Entry of dspi_dotprod_s8_aes3: argument validation and slow-path selection.
// Incoming registers are forwarded unchanged (as a10..a15) to dspi_dotprod_s8_ansi,
// so they presumably carry the same six arguments:
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
// Descriptor words read here: +0 data, +4 step_x, +8 step_y, +12 stride_x, +16 stride_y
// (field names follow the equivalent checks of the ANSI C implementation).
dspi_dotprod_s8_aes3: # 0x4
.LBB1_dspi_dotprod_s8_aes3: # 0x4
entry a1,48 #
// Check 1: in_image word[+4] * count_x must not exceed in_image word[+12].
l32i.n a10,a2,4 # [0] id:668
l32i.n a11,a2,12 # [1] id:667
mull a8,a10,a5 # [2]
blt a11,a8,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Check 2: in_image word[+8] * count_y must not exceed in_image word[+16].
l32i.n a12,a2,8 # [0] id:669
l32i.n a9,a2,16 # [1] id:670
mull a13,a12,a6 # [2]
blt a9,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Check 3: filter word[+4] * count_x must not exceed filter word[+12].
l32i.n a15,a3,4 # [0] id:672
l32i.n a14,a3,12 # [1] id:671
mull a13,a15,a5 # [2]
blt a14,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
// Check 4: filter word[+8] * count_y must not exceed filter word[+16].
// filter word[+8] is also spilled to the stack frame for the test below.
l32i.n a8,a3,16 # [0] id:674
l32i.n a9,a3,8 # [1] id:673
s32i.n a9,a1,8 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB78_dspi_dotprod_s8_aes3 # [5]
// Filter data pointer (word[+0]) is spilled for the vector path.
// The vector path is taken only when: bit 0 of the filter data pointer is clear,
// filter word[+12] == filter word[+4] * count_x, filter word[+4] == 1 and
// filter word[+8] == 1. Anything else falls through to the ANSI call.
l32i.n a8,a3,0 # [0] id:675
s32i.n a8,a1,4 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_33026 # [2]
bne a14,a13,.Lt_0_33026 # [0]
bnei a15,1,.Lt_0_33026 # [0]
l32i.n a13,a1,8 # [0] gra_spill_temp_2
beqi a13,1,.Lt_0_17666 # [2]
// Slow path: call the C reference with the original arguments and return its result.
.Lt_0_33026: # 0x43
.Lt_0_17922: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_s8_ansi, @function
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range-check failure: return the constant held in literal .LC0_1_52
// (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, matching the ANSI version - confirm).
.LBB78_dspi_dotprod_s8_aes3: # 0x56
l32r a2,.LC0_1_52 # [0]
retw.n # [1]
// Vector-path gate of dspi_dotprod_s8_aes3. On arrival a10/a12/a11 still hold
// in_image word[+4]/word[+8]/word[+12] loaded at function entry, a5 = count_x,
// a6 = count_y, a7 = shift.
// Falls back to the ANSI call (.Lt_0_33794) unless word[+4] == 1, word[+8] == 1,
// count_x is a multiple of 16 and count_y >= 4.
.Lt_0_17666: # 0x5b
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_33794 # [1]
addi.n a15,a12,-1 # [0]
bnez a15,.Lt_0_33794 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_33794 # [1]
blti a6,4,.Lt_0_33794 # [0]
// For count_x > 64 an odd count_x is routed to the ANSI call as well
// (count_x was already required to be a multiple of 16 just above).
movi.n a9,64 # [0]
bge a9,a5,.Lt_0_34306 # [1]
extui a10,a5,0,1 # [0]
bnez a10,.LBB28_dspi_dotprod_s8_aes3 # [1]
// Common setup: a3 = count_y, a2 = in_image data pointer, a15 = filter data
// pointer (from the spill slot), ACCX cleared, first filter vector preloaded
// into q0. The spill slot at a1+0 receives in_image word[+12] * word[+8],
// i.e. the input row pitch in bytes.
.Lt_0_34306: # 0x78
.Lt_0_19714: # 0x78
mov.n a3,a6 # [0]
addi a13,a5,-48 # [1]
movi.n a14,0 # [2]
mull a15,a11,a12 # [3]
l32i.n a2,a2,0 # [4] id:676
s32i.n a15,a1,0 # [6] gra_spill_temp_0
wur.accx_0 a14 # [7]
l32i.n a15,a1,4 # [8] gra_spill_temp_1
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:679
// Dispatch on count_x: 48, 32, 16, 64 and 128 have dedicated loops; all but
// the 128 case jump back into this chain after they have finished.
beqz a13,.LBB32_dspi_dotprod_s8_aes3 # [11]
.Lt_0_22786: # 0x93
.Lt_0_22274: # 0x93
addi a8,a5,-32 # [0]
beqz a8,.LBB38_dspi_dotprod_s8_aes3 # [1]
.Lt_0_24322: # 0x99
.Lt_0_23810: # 0x99
addi a9,a5,-16 # [0]
beqz a9,.LBB44_dspi_dotprod_s8_aes3 # [1]
.Lt_0_25858: # 0x9f
.Lt_0_25346: # 0x9f
addi a10,a5,-64 # [0]
beqz a10,.LBB50_dspi_dotprod_s8_aes3 # [1]
.Lt_0_27394: # 0xa5
.Lt_0_26882: # 0xa5
addi a11,a5,-128 # [0]
beqz a11,.LBB56_dspi_dotprod_s8_aes3 # [1]
// count_x <= 128 at this point means a dedicated loop already ran (or nothing
// matched): go straight to the final rounding.
movi a12,128 # [0]
bge a12,a5,.Lt_0_30210 # [1]
// Generic loop for count_x > 128. a12 = row counter; q1..q3 are primed with
// unaligned loads from the input row.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:751
ee.ld.128.usar.ip q2,a2,16 # [2] id:752
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:753
beqz.n a3,.Lt_0_30210 # [5]
// a13 = count_x / 64 - 1 (signed division): trip count of the inner hardware loop.
// a14 = row pitch - count_x + 16: input pointer adjustment used once per row.
l32i.n a14,a1,0 # [0] gra_spill_temp_0
addi a13,a5,63 # [1]
movgez a13,a5,a5 # [2]
srai a13,a13,6 # [3]
sub a14,a14,a5 # [4]
addi a14,a14,16 # [5]
addi.n a13,a13,-1 # [6]
// One pass of this outer loop per row, until a12 == a3 (count_y).
.Lt_0_30978: # 0xd1
addi.n a12,a12,1 # [0]
movi.n a8,32 # [1]
movi.n a9,-16 # [2]
beqz.n a13,.Lt_0_31234 # [3]
loopnez a13,.LBB218_dspi_dotprod_s8_aes3 # [0]
// Inner loop body: four multiply-accumulate steps, each paired with a
// 128-bit filter load (a15) and an input load (a2).
.LBB216_dspi_dotprod_s8_aes3: # 0xdc
ee.vld.128.ip q5,a15,16 # [0*II+0] id:755
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:754
ee.vld.128.ip q0,a15,16 # [0*II+2] id:757
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:756
ee.vld.128.ip q5,a15,16 # [0*II+4] id:759
ee.vmulas.s8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:758
ee.vld.128.ip q0,a15,16 # [0*II+6] id:761
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:760
// Row tail: four more multiply-accumulate steps; the .xp loads apply a14 and
// a9 (-16) / a8 (32) to move the input pointer on to the next row.
.LBB218_dspi_dotprod_s8_aes3: # 0xf8
.Lt_0_31234: # 0xf8
ee.vmulas.s8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:762
ee.vld.128.ip q0,a15,16 # [1] id:763
ee.vld.128.ip q6,a15,16 # [2] id:765
ee.vmulas.s8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:764
ee.vld.128.ip q4,a15,16 # [4] id:768
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:766
ee.ld.128.usar.xp q1,a2,a8 # [6] id:767
ee.vld.128.ip q0,a15,16 # [7] id:770
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:769
bne a12,a3,.Lt_0_30978 # [9]
// Finalisation: only the low accumulator word is used.
// result = (ACCX_0 + (1 << (shift - 1))) >> shift (arithmetic), stored as one
// byte to out_value; the function returns 0.
.Lt_0_30210: # 0x11a
.Lt_0_29954: # 0x11a
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:772
retw.n # [10]
// ANSI fallback used by the vector-path gate of dspi_dotprod_s8_aes3:
// forwards a2..a7 unchanged and returns the callee's result.
.Lt_0_33794: # 0x136
.Lt_0_18946: # 0x136
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
#.LBB25_dspi_dotprod_s8_aes3: # 0x145
mov.n a2,a10 # [0]
retw.n # [1]
// Dedicated loop for count_x == 48 (three 128-bit vectors per row).
// a12 = row pitch (spill slot a1+0) - 32. Runs count_y (a6) iterations,
// then rejoins the dispatch chain.
.LBB32_dspi_dotprod_s8_aes3: # 0x149
ee.ld.128.usar.ip q1,a2,16 # [0] id:680
ee.ld.128.usar.ip q2,a2,16 # [1] id:681
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:682
beqz.n a6,.Lt_0_22786 # [4]
movi.n a10,32 # [0]
l32i.n a12,a1,0 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB104_dspi_dotprod_s8_aes3 # [4]
.LBB102_dspi_dotprod_s8_aes3: # 0x160
ee.vld.128.ip q4,a15,16 # [0*II+0] id:684
ee.vmulas.s8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:683
ee.vld.128.ip q5,a15,16 # [0*II+2] id:686
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:685
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:687
ee.vld.128.ip q0,a15,16 # [0*II+5] id:689
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:688
.LBB104_dspi_dotprod_s8_aes3: # 0x178
j .Lt_0_22786 # [0]
// Dedicated loop for count_x == 32. The loop count is count_y >> 1 and each
// iteration issues four multiply-accumulate steps.
// NOTE(review): with an odd count_y the last row appears to be left out - confirm.
.LBB38_dspi_dotprod_s8_aes3: # 0x17b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,0 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:690
ee.ld.128.usar.ip q2,a2,16 # [5] id:691
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:692
loopnez a3,.LBB127_dspi_dotprod_s8_aes3 # [9]
.LBB125_dspi_dotprod_s8_aes3: # 0x193
ee.vld.128.ip q4,a15,16 # [0*II+0] id:694
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:693
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:695
ee.vld.128.ip q0,a15,16 # [0*II+3] id:697
ee.vmulas.s8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:696
ee.vld.128.ip q5,a15,16 # [0*II+5] id:699
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:698
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:700
ee.vld.128.ip q0,a15,16 # [0*II+8] id:702
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:701
.LBB127_dspi_dotprod_s8_aes3: # 0x1b5
j .Lt_0_24322 # [0]
// Dedicated loop for count_x == 16. The loop count is a3 >> 2 (a3 = count_y)
// and each iteration issues four multiply-accumulate steps. The input is
// walked through a8 (= a2 + 16), which is copied back to a2 afterwards.
// NOTE(review): rows beyond a multiple of 4 appear to be left out - confirm.
.LBB44_dspi_dotprod_s8_aes3: # 0x1b8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,0 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:703
ee.ld.128.usar.xp q1,a8,a11 # [6] id:704
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:705
ee.ld.128.usar.xp q2,a8,a11 # [9] id:706
loopnez a3,.LBB150_dspi_dotprod_s8_aes3 # [10]
.LBB148_dspi_dotprod_s8_aes3: # 0x1d4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:707
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:709
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:710
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:712
ee.vld.128.ip q5,a15,16 # [0*II+6] id:714
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:713
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:715
ee.vld.128.ip q0,a15,16 # [0*II+9] id:717
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:716
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:718
.LBB150_dspi_dotprod_s8_aes3: # 0x1fc
mov.n a2,a8 # [0]
j .Lt_0_25858 # [1]
// Dedicated loop for count_x == 64: a3 (count_y) iterations of four
// multiply-accumulate steps. a12 = row pitch - count_x + 16.
.LBB50_dspi_dotprod_s8_aes3: # 0x201
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:719
ee.ld.128.usar.ip q2,a2,16 # [4] id:720
sub a12,a12,a5 # [5]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:721
addi a12,a12,16 # [8]
loopnez a3,.LBB173_dspi_dotprod_s8_aes3 # [9]
.LBB171_dspi_dotprod_s8_aes3: # 0x219
ee.vld.128.ip q5,a15,16 # [0*II+0] id:723
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:722
ee.vld.128.ip q1,a15,16 # [0*II+2] id:725
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:724
ee.vld.128.ip q5,a15,16 # [0*II+4] id:728
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:726
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:727
ee.vld.128.ip q0,a15,16 # [0*II+7] id:730
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:729
.LBB173_dspi_dotprod_s8_aes3: # 0x238
j .Lt_0_27394 # [0]
// Dedicated loop for count_x == 128: a3 (count_y) iterations of eight
// multiply-accumulate steps. This path carries its own copy of the final
// rounding instead of jumping to .Lt_0_30210.
.LBB56_dspi_dotprod_s8_aes3: # 0x23b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:731
ee.ld.128.usar.ip q2,a2,16 # [4] id:732
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:733
loopnez a3,.LBB195_dspi_dotprod_s8_aes3 # [9]
.LBB193_dspi_dotprod_s8_aes3: # 0x253
ee.vld.128.ip q4,a15,16 # [0*II+0] id:735
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:734
ee.vld.128.ip q0,a15,16 # [0*II+2] id:737
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:736
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
ee.vmulas.s8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:738
ee.vld.128.ip q6,a15,16 # [0*II+6] id:741
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:740
ee.vld.128.ip q5,a15,16 # [0*II+8] id:743
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:742
ee.vld.128.ip q6,a15,16 # [0*II+10] id:745
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:744
ee.vld.128.ip q5,a15,16 # [0*II+12] id:748
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:746
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:747
ee.vld.128.ip q0,a15,16 # [0*II+15] id:750
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:749
// result = (ACCX_0 + (1 << (shift - 1))) >> shift, stored as one byte; returns 0.
.LBB195_dspi_dotprod_s8_aes3: # 0x28e
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
sra a10,a10 # [8]
s8i a10,a4,0 # [9] id:772
retw.n # [10]
// ANSI fallback taken for count_x > 64 with bit 0 of count_x set.
.LBB28_dspi_dotprod_s8_aes3: # 0x2aa
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
#.LBB29_dspi_dotprod_s8_aes3: # 0x2b9
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (plain C) 2D dot product of signed 8-bit data:
 *
 *   acc = sum over y in [0, count_y), x in [0, count_x) of
 *         in_image[y][x] * filter[y][x]
 *
 * The 32-bit accumulator is rounded to nearest, shifted right by `shift`
 * and then stored (truncated to 8 bits) to *out_value.
 *
 * @param in_image   input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter     filter descriptor with the same fields
 * @param out_value  receives the result
 * @param count_x    number of elements processed per row
 * @param count_y    number of rows processed
 * @param shift      right shift applied to the accumulator; values <= 0 mean
 *                   "no rounding and no shift"
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
 *         step * count exceeds the matching stride of the image or the filter.
 */
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift)
{
    // The requested window must fit into both descriptors.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    // Distance (in elements) between two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. Skipped for shift <= 0, where the original
    // "1 << (shift - 1)" (and a negative shift count) would be undefined.
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,93 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_s8_arp4
.global dspi_dotprod_s8_ansi
.type dspi_dotprod_s8_arp4,@function
// esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
//
// Vector version of dspi_dotprod_s8_ansi. It is used only when
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 16;
// every other case tail-jumps to dspi_dotprod_s8_ansi with the arguments untouched.
// NOTE(review): unlike the ANSI version, the vector path does not perform the
// step * count <= stride range checks - confirm callers guarantee them.
dspi_dotprod_s8_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// i_data - t0 (start of the current input row)
// f_data - t1 (start of the current filter row)
// i_step - t2 (stride_x * step_y of in_image)
// f_step - t3 (stride_x * step_y of filter)
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 16 (hardware loop count)
lw t1, 4(a0) // load in_image->step_x
lw t2, 4(a1) // load filter->step_x
or t1, t1, t2
addi t1, t1, -1 // 0 only when (step_x | step_x) == 1
andi t2, a3, 15 // count_x % 16
or t1, t1, t2
beqz t1, .dspi_dotprod_s8_arp4_body
j dspi_dotprod_s8_ansi // unsupported layout: tail-call the C reference
.dspi_dotprod_s8_arp4_body:
add sp, sp, -16 // NOTE(review): this 16-byte frame is never read or written below
lw t0, 0(a0) // i_data
lw t1, 0(a1) // f_data
lw t2, 8(a0) // step_y
lw t4, 12(a0) // stride_x
mul t2, t4, t2 // i_step in bytes (1 byte per element)
lw t3, 8(a1) // step_y
lw t5, 12(a1) // stride_x
mul t3, t5, t3 // f_step in bytes
srli t6, a3, 4 // t6 = count_x / 16
addi a6, a5, -1
li t4, 1
sll t4, t4, a6 // t4 = 1 << (shift - 1): rounding constant
esp.zero.xacc
esp.movx.w.xacc.l t4 // preload the accumulator with the rounding constant
.loop_count_y:
mv t4, t0
mv t5, t1
esp.vld.128.ip q0, t4, 16 // q0 - i_data (first 16 bytes of the row)
esp.lp.setup 0, t6, .loop_count_x
esp.vld.128.ip q1, t5, 16 // q1 - f_data
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q0, t4, 16, q0, q1 // xacc += q0 * q1, q0 - next i_data
add t0, t0, t2
add t1, t1, t3
add a4,a4, -1
bgtz a4, .loop_count_y
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
sb t5, 0(a2) // out_value is int8_t*: store exactly one byte (a halfword store would overrun it)
li a0,0 // return 0
add sp,sp,16
ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,371 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_55, 458755
# Program Unit: dspi_dotprod_u16_aes3
.type dspi_dotprod_u16_aes3, @function
.align 4
.global dspi_dotprod_u16_aes3
// dspi_dotprod_u16_aes3 - unsigned 16-bit 2D dot product (Xtensa, ee.* vector MAC).
//
// Takes the same six arguments as dspi_dotprod_u16_ansi, in a2..a7:
//   a2 - in_image, a3 - filter, a4 - out_value, a5 - count_x, a6 - count_y, a7 - shift
// Structure fields are read at offsets 0 (data), 4 (step_x), 8 (step_y),
// 12 (stride_x) and 16 (stride_y), mirroring the range checks of the ANSI code.
//
// Flow:
//   1. Range checks; on failure return the literal .LC0_1_55
//      (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE).
//   2. If the layout is not suitable for the vector code, call the ANSI
//      implementation. The fallback must be the *unsigned* kernel
//      dspi_dotprod_u16_ansi: the signed s16 kernel would sign-extend
//      samples >= 0x8000 and return a different sum.
//   3. Otherwise run one of the unrolled loops selected by count_x
//      (8, 16, 24, 32, 64 or the general > 64 case), then round, shift and
//      store the 16-bit result.
dspi_dotprod_u16_aes3: # 0x4
.LBB1_dspi_dotprod_u16_aes3: # 0x4
entry a1,64 #
// in_image->step_x * count_x must not exceed in_image->stride_x
l32i.n a10,a2,4 # [0] id:681
l32i.n a11,a2,12 # [1] id:680
mull a8,a10,a5 # [2]
blt a11,a8,.LBB81_dspi_dotprod_u16_aes3 # [4]
// in_image->step_y * count_y must not exceed in_image->stride_y
l32i.n a12,a2,8 # [0] id:682
l32i.n a9,a2,16 # [1] id:683
mull a13,a12,a6 # [2]
blt a9,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
// filter->step_x * count_x must not exceed filter->stride_x
l32i.n a15,a3,4 # [0] id:685
l32i.n a14,a3,12 # [1] id:684
mull a13,a15,a5 # [2]
blt a14,a13,.LBB81_dspi_dotprod_u16_aes3 # [4]
// filter->step_y * count_y must not exceed filter->stride_y
l32i.n a8,a3,16 # [0] id:687
l32i.n a9,a3,8 # [1] id:686
s32i.n a9,a1,24 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB81_dspi_dotprod_u16_aes3 # [5]
// Vector path needs: filter->data with bit 0 clear, filter->stride_x equal to
// step_x * count_x, and filter step_x == step_y == 1.
l32i.n a8,a3,0 # [0] id:688
s32i.n a8,a1,20 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_34050 # [2]
bne a14,a13,.Lt_0_34050 # [0]
bnei a15,1,.Lt_0_34050 # [0]
l32i.n a9,a1,24 # [0] gra_spill_temp_2
beqi a9,1,.Lt_0_18178 # [2]
// Fallback 1: filter layout not suitable - forward all six arguments.
.Lt_0_34050: # 0x43
.Lt_0_18434: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_u16_ansi, @function
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error code held in the literal pool.
.LBB81_dspi_dotprod_u16_aes3: # 0x56
l32r a2,.LC0_1_55 # [0]
retw.n # [1]
// Remaining vector-path conditions: in_image step_x == 1 and step_y == 1,
// count_x a multiple of 8, count_y >= 4, and count_x even when it exceeds 32.
.Lt_0_18178: # 0x5b
addi.n a13,a10,-1 # [0]
bnez a13,.Lt_0_34818 # [1]
addi.n a14,a12,-1 # [0]
bnez a14,.Lt_0_34818 # [1]
extui a15,a5,0,3 # [0]
bnez.n a15,.Lt_0_34818 # [1]
blti a6,4,.Lt_0_34818 # [0]
movi.n a8,32 # [0]
bge a8,a5,.Lt_0_35330 # [1]
extui a9,a5,0,1 # [0]
bnez a9,.LBB28_dspi_dotprod_u16_aes3 # [1]
// Common set-up: a3 = count_y, a2 = in_image->data, a15 = filter->data,
// accumulator cleared, first filter block preloaded into q0,
// gra_spill_temp_0 = in_image stride_x * step_y * 2 (row advance in bytes).
.Lt_0_35330: # 0x78
.Lt_0_20226: # 0x78
mov.n a3,a6 # [0]
addi a10,a5,-24 # [1]
mull a13,a11,a12 # [2]
l32i.n a15,a1,20 # [3] gra_spill_temp_1
l32i.n a2,a2,0 # [4] id:689
movi.n a14,0 # [5]
wur.sar_byte a14 # [6]
wur.accx_0 a14 # [8]
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:693
slli a13,a13,1 # [11]
s32i.n a13,a1,16 # [12] gra_spill_temp_0
// Dispatch on count_x: 24, 16, 8, 32, 64, then the general loop.
beqz a10,.LBB32_dspi_dotprod_u16_aes3 # [13]
.Lt_0_23298: # 0x99
.Lt_0_22786: # 0x99
addi a8,a5,-16 # [0]
beqz a8,.LBB38_dspi_dotprod_u16_aes3 # [1]
.Lt_0_24834: # 0x9f
.Lt_0_24322: # 0x9f
addi a9,a5,-8 # [0]
beqz a9,.LBB44_dspi_dotprod_u16_aes3 # [1]
.Lt_0_26370: # 0xa5
.Lt_0_25858: # 0xa5
addi a10,a5,-32 # [0]
beqz a10,.LBB50_dspi_dotprod_u16_aes3 # [1]
.Lt_0_27906: # 0xab
.Lt_0_27394: # 0xab
addi a11,a5,-64 # [0]
beqz a11,.LBB56_dspi_dotprod_u16_aes3 # [1]
movi.n a12,64 # [0]
bge a12,a5,.Lt_0_30722 # [1]
// General case, count_x > 64: a12 counts rows, a13 = count_x / 32 - 1 inner
// trips of four MACs, followed by a four-MAC tail that also steps a2 to the
// next image row (a14 = row advance - 2 * count_x + 16).
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:765
ee.ld.128.usar.ip q2,a2,16 # [2] id:766
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:767
beqz.n a3,.Lt_0_30722 # [5]
slli a8,a5,1 # [0]
l32i.n a14,a1,16 # [1] gra_spill_temp_0
addi a13,a5,31 # [2]
movgez a13,a5,a5 # [3]
srai a13,a13,5 # [4]
sub a14,a14,a8 # [5]
addi a14,a14,16 # [6]
addi.n a13,a13,-1 # [7]
.Lt_0_31490: # 0xd9
addi.n a12,a12,1 # [0]
movi.n a9,32 # [1]
beqz.n a13,.Lt_0_31746 # [2]
loopnez a13,.LBB221_dspi_dotprod_u16_aes3 # [0]
.LBB219_dspi_dotprod_u16_aes3: # 0xe2
ee.vld.128.ip q5,a15,16 # [0*II+0] id:769
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:768
ee.vld.128.ip q0,a15,16 # [0*II+2] id:771
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:770
ee.vld.128.ip q5,a15,16 # [0*II+4] id:773
ee.vmulas.u16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:772
ee.vld.128.ip q0,a15,16 # [0*II+6] id:775
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:774
.LBB221_dspi_dotprod_u16_aes3: # 0xfe
.Lt_0_31746: # 0xfe
ee.vmulas.u16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:776
movi.n a10,-16 # [1]
ee.vld.128.ip q0,a15,16 # [2] id:777
ee.vld.128.ip q6,a15,16 # [3] id:779
ee.vmulas.u16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:778
ee.vld.128.ip q4,a15,16 # [5] id:782
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:780
ee.ld.128.usar.xp q1,a2,a9 # [7] id:781
ee.vld.128.ip q0,a15,16 # [8] id:784
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:783
bne a12,a3,.Lt_0_31490 # [10]
// Result: read the accumulator (a9 = low word, a10 = high word). For
// shift >= 1 compute ((acc >> (shift - 1)) + 1) >> 1 and store 16 bits;
// for shift < 1 store the low 16 bits without rounding.
.Lt_0_30722: # 0x122
.Lt_0_30466: # 0x122
rur.accx_0 a9 # [0]
rur.accx_1 a10 # [1]
blti a7,1,.Lt_0_33282 # [2]
movi.n a2,0 # [0]
addi a13,a7,-33 # [1]
addi.n a14,a7,-1 # [2]
ssr a14 # [3]
sra a12,a10 # [4]
src a11,a10,a9 # [5]
movgez a11,a12,a13 # [6]
addi.n a11,a11,1 # [7]
srli a11,a11,1 # [8]
s16i a11,a4,0 # [9] id:790
retw.n # [10]
// Fallback 2: image steps, count_x granularity or count_y not suitable.
.Lt_0_34818: # 0x148
.Lt_0_19458: # 0x148
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 24: one loop trip per row, three MACs per trip.
.LBB32_dspi_dotprod_u16_aes3: # 0x15b
ee.ld.128.usar.ip q1,a2,16 # [0] id:694
ee.ld.128.usar.ip q2,a2,16 # [1] id:695
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:696
beqz.n a6,.Lt_0_23298 # [4]
addi a12,a13,-32 # [0]
movi.n a10,32 # [1]
movi.n a11,-16 # [2]
loopgtz a6,.LBB107_dspi_dotprod_u16_aes3 # [3]
.LBB105_dspi_dotprod_u16_aes3: # 0x170
ee.vld.128.ip q4,a15,16 # [0*II+0] id:698
ee.vmulas.u16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:697
ee.vld.128.ip q5,a15,16 # [0*II+2] id:700
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:699
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:701
ee.vld.128.ip q0,a15,16 # [0*II+5] id:703
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:702
.LBB107_dspi_dotprod_u16_aes3: # 0x188
j .Lt_0_23298 # [0]
// count_x == 16: count_y / 2 loop trips, four MACs (two rows) per trip.
.LBB38_dspi_dotprod_u16_aes3: # 0x18b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:704
ee.ld.128.usar.ip q2,a2,16 # [5] id:705
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:706
loopnez a3,.LBB130_dspi_dotprod_u16_aes3 # [9]
.LBB128_dspi_dotprod_u16_aes3: # 0x1a3
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:707
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:709
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
ee.vmulas.u16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:710
ee.vld.128.ip q5,a15,16 # [0*II+5] id:713
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:712
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:714
ee.vld.128.ip q0,a15,16 # [0*II+8] id:716
ee.vmulas.u16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:715
.LBB130_dspi_dotprod_u16_aes3: # 0x1c5
j .Lt_0_24834 # [0]
// count_x == 8: count_y / 4 loop trips, four MACs (four rows) per trip.
// The image pointer is walked in a8 and copied back to a2 afterwards.
.LBB44_dspi_dotprod_u16_aes3: # 0x1c8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,16 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:717
ee.ld.128.usar.xp q1,a8,a11 # [6] id:718
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:719
ee.ld.128.usar.xp q2,a8,a11 # [9] id:720
loopnez a3,.LBB153_dspi_dotprod_u16_aes3 # [10]
.LBB151_dspi_dotprod_u16_aes3: # 0x1e4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:722
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:721
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:723
ee.vld.128.ip q0,a15,16 # [0*II+3] id:725
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:724
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:726
ee.vld.128.ip q5,a15,16 # [0*II+6] id:728
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:727
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:729
ee.vld.128.ip q0,a15,16 # [0*II+9] id:731
ee.vmulas.u16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:730
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:732
.LBB153_dspi_dotprod_u16_aes3: # 0x20c
mov.n a2,a8 # [0]
j .Lt_0_26370 # [1]
// count_x == 32: one loop trip per row, four MACs per trip.
.LBB50_dspi_dotprod_u16_aes3: # 0x211
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:733
ee.ld.128.usar.ip q2,a2,16 # [5] id:734
sub a12,a12,a13 # [6]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:735
addi a12,a12,16 # [9]
loopnez a3,.LBB176_dspi_dotprod_u16_aes3 # [10]
.LBB174_dspi_dotprod_u16_aes3: # 0x22c
ee.vld.128.ip q5,a15,16 # [0*II+0] id:737
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:736
ee.vld.128.ip q1,a15,16 # [0*II+2] id:739
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:738
ee.vld.128.ip q5,a15,16 # [0*II+4] id:742
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:740
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:741
ee.vld.128.ip q0,a15,16 # [0*II+7] id:744
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:743
.LBB176_dspi_dotprod_u16_aes3: # 0x24b
j .Lt_0_27906 # [0]
// count_x == 64: one loop trip per row, eight MACs per trip.
.LBB56_dspi_dotprod_u16_aes3: # 0x24e
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
slli a13,a5,1 # [2]
l32i.n a12,a1,16 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:745
ee.ld.128.usar.ip q2,a2,16 # [5] id:746
sub a12,a12,a13 # [7]
addi a12,a12,16 # [8]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:747
loopnez a3,.LBB198_dspi_dotprod_u16_aes3 # [10]
.LBB196_dspi_dotprod_u16_aes3: # 0x269
ee.vld.128.ip q4,a15,16 # [0*II+0] id:749
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:748
ee.vld.128.ip q0,a15,16 # [0*II+2] id:751
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:750
ee.vld.128.ip q5,a15,16 # [0*II+4] id:753
ee.vmulas.u16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:752
ee.vld.128.ip q6,a15,16 # [0*II+6] id:755
ee.vmulas.u16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:754
ee.vld.128.ip q5,a15,16 # [0*II+8] id:757
ee.vmulas.u16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:756
ee.vld.128.ip q6,a15,16 # [0*II+10] id:759
ee.vmulas.u16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:758
ee.vld.128.ip q5,a15,16 # [0*II+12] id:762
ee.vmulas.u16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:760
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:761
ee.vld.128.ip q0,a15,16 # [0*II+15] id:764
ee.vmulas.u16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:763
.LBB198_dspi_dotprod_u16_aes3: # 0x2a4
j .Lt_0_30722 # [0]
// shift < 1: store the low 16 bits of the accumulator without rounding.
.Lt_0_33282: # 0x2a7
movi.n a2,0 # [0]
sext a14,a9,15 # [1]
s16i a14,a4,0 # [2] id:791
retw.n # [3]
// Fallback 3: count_x > 32 and odd.
.LBB28_dspi_dotprod_u16_aes3: # 0x2b1
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_u16_ansi # [6] dspi_dotprod_u16_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2D dot product for unsigned 16-bit images.
 *
 * Multiplies a count_x by count_y window of in_image element-wise with the
 * same-sized window of filter, sums the products, rounds, shifts the sum
 * right by shift and stores the low 16 bits in *out_value.
 *
 * @param in_image  image descriptor; data, step_x, step_y, stride_x and stride_y are read
 * @param filter    filter descriptor, same fields as in_image
 * @param out_value destination for the 16-bit result
 * @param count_x   number of columns to process
 * @param count_y   number of rows to process
 * @param shift     right shift applied to the sum; when shift <= 0 the sum is
 *                  stored without rounding or shifting
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when a step * count
 *         product exceeds the corresponding stride.
 */
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    // Row advance, counted in uint16_t elements.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;
    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // Widen before multiplying: 0xFFFF * 0xFFFF does not fit in int32_t.
            acc += (int64_t)i_data[in_image->step_x * x] * (int64_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }
    if (shift > 0) {
        // Round to nearest. The term is built in 64 bits so shift > 31 is not
        // an int overflow, and shift <= 0 never reaches a negative shift count.
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,95 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_u16_arp4
.global dspi_dotprod_u16_ansi
.type dspi_dotprod_u16_arp4,@function
// esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
dspi_dotprod_u16_arp4:
// Unsigned 16-bit 2D dot product using the vector accumulator (XACC).
//
// Arguments (see the prototype comment above):
//   a0 - in_image      a1 - filter       a2 - out_value
//   a3 - count_x       a4 - count_y      a5 - shift
// Working registers:
//   t0 / t1 - start of the current image / filter row
//   t2 / t3 - row advance for image / filter in bytes (stride_x * step_y * 2)
//   t4 / t5 - running image / filter pointers inside a row
//   t6      - count_x / 8 (128-bit blocks per row)
//   a6      - shift - 1 (scratch)
//
// The vector path is used only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 8; everything else is handed to the ANSI
// implementation with the argument registers untouched.
    lw      t1, 4(a0)                   // in_image->step_x
    lw      t2, 4(a1)                   // filter->step_x
    or      t1, t1, t2
    andi    t2, a3, 7                   // count_x % 8
    addi    t1, t1, -1                  // 0 when the OR-ed steps equal 1
    or      t1, t1, t2
    bnez    t1, .Ldspi_dotprod_u16_arp4_fallback

    addi    sp, sp, -16                 // reserved frame (nothing is spilled into it)

    lw      t4, 12(a0)                  // in_image->stride_x
    lw      t2, 8(a0)                   // in_image->step_y
    lw      t5, 12(a1)                  // filter->stride_x
    lw      t3, 8(a1)                   // filter->step_y
    lw      t0, 0(a0)                   // i_data
    lw      t1, 0(a1)                   // f_data
    mul     t2, t4, t2                  // i_step = stride_x * step_y (elements)
    mul     t3, t5, t3                  // f_step = stride_x * step_y (elements)
    slli    t2, t2, 1                   // i_step in bytes
    slli    t3, t3, 1                   // f_step in bytes

    srli    t6, a3, 3                   // t6 = count_x / 8
    li      t4, 1
    addi    a6, a5, -1
    sll     t4, t4, a6                  // rounding term 1 << (shift - 1)
    esp.zero.xacc
    esp.movx.w.xacc.l t4                // preload the accumulator with the rounding term

.Ldspi_dotprod_u16_arp4_row:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16           // q0 - first 8 image samples of the row
    // Hardware loop 0: t6 passes over the two instructions ending at the
    // labelled MAC. Each MAC also fetches the next image block, so the last
    // pass of a row loads 16 bytes beyond the data it multiplies.
    esp.lp.setup 0, t6, .Ldspi_dotprod_u16_arp4_mac
    esp.vld.128.ip q1, t5, 16           // q1 - f_data
.Ldspi_dotprod_u16_arp4_mac:
    esp.vmulas.u16.xacc.ld.ip q0, t4, 16, q0, q1 // xacc += q0 * q1; q0 - next i_data

    add     t0, t0, t2                  // next image row
    add     t1, t1, t3                  // next filter row
    addi    a4, a4, -1
    bgtz    a4, .Ldspi_dotprod_u16_arp4_row

    esp.srs.u.xacc t5, a5               // t5 = xacc >> shift (unsigned)
    sh      t5, 0(a2)                   // store the 16-bit result
    addi    sp, sp, 16
    li      a0, 0                       // return 0 (presumably ESP_OK)
    ret

.Ldspi_dotprod_u16_arp4_fallback:
    j       dspi_dotprod_u16_ansi       // tail call, a0..a5 still hold the arguments
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,367 @@
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_aes3_enabled == 1)
.text
.align 4
.literal .LC0_1_52, 458755
.type dspi_dotprod_u8_aes3, @function
.align 4
.global dspi_dotprod_u8_aes3
// dspi_dotprod_u8_aes3 - unsigned 8-bit 2D dot product (Xtensa, ee.* vector MAC).
//
// Takes the same six arguments as dspi_dotprod_u8_ansi, in a2..a7:
//   a2 - in_image, a3 - filter, a4 - out_value, a5 - count_x, a6 - count_y, a7 - shift
// Structure fields are read at offsets 0 (data), 4 (step_x), 8 (step_y),
// 12 (stride_x) and 16 (stride_y), mirroring the range checks of the ANSI code.
//
// Flow:
//   1. Range checks; on failure return the literal .LC0_1_52
//      (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE).
//   2. If the layout is not suitable for the vector code, call
//      dspi_dotprod_u8_ansi with the unchanged arguments.
//   3. Otherwise run one of the unrolled loops selected by count_x
//      (16, 32, 48, 64, 128 or the general > 128 case), add the rounding term
//      1 << (shift - 1), shift right by shift and store one byte.
dspi_dotprod_u8_aes3: # 0x4
.LBB1_dspi_dotprod_u8_aes3: # 0x4
entry a1,48 #
// in_image->step_x * count_x must not exceed in_image->stride_x
l32i.n a10,a2,4 # [0] id:669
l32i.n a11,a2,12 # [1] id:668
mull a8,a10,a5 # [2]
blt a11,a8,.LBB78_dspi_dotprod_u8_aes3 # [4]
// in_image->step_y * count_y must not exceed in_image->stride_y
l32i.n a12,a2,8 # [0] id:670
l32i.n a9,a2,16 # [1] id:671
mull a13,a12,a6 # [2]
blt a9,a13,.LBB78_dspi_dotprod_u8_aes3 # [4]
// filter->step_x * count_x must not exceed filter->stride_x
l32i.n a15,a3,4 # [0] id:673
l32i.n a14,a3,12 # [1] id:672
mull a13,a15,a5 # [2]
blt a14,a13,.LBB78_dspi_dotprod_u8_aes3 # [4]
// filter->step_y * count_y must not exceed filter->stride_y
l32i.n a8,a3,16 # [0] id:675
l32i.n a9,a3,8 # [1] id:674
s32i.n a9,a1,8 # [2] gra_spill_temp_2
mull a9,a9,a6 # [3]
blt a8,a9,.LBB78_dspi_dotprod_u8_aes3 # [5]
// Vector path needs: filter->data with bit 0 clear, filter->stride_x equal to
// step_x * count_x, and filter step_x == step_y == 1.
l32i.n a8,a3,0 # [0] id:676
s32i.n a8,a1,4 # [1] gra_spill_temp_1
bbsi a8,0,.Lt_0_33026 # [2]
bne a14,a13,.Lt_0_33026 # [0]
bnei a15,1,.Lt_0_33026 # [0]
l32i.n a13,a1,8 # [0] gra_spill_temp_2
beqi a13,1,.Lt_0_17666 # [2]
// Fallback 1: filter layout not suitable - forward all six arguments.
.Lt_0_33026: # 0x43
.Lt_0_17922: # 0x43
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
.type dspi_dotprod_u8_ansi, @function
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// Range check failed: return the error code held in the literal pool.
.LBB78_dspi_dotprod_u8_aes3: # 0x56
l32r a2,.LC0_1_52 # [0]
retw.n # [1]
// Remaining vector-path conditions: in_image step_x == 1 and step_y == 1,
// count_x a multiple of 16, count_y >= 4, and count_x even when it exceeds 64.
.Lt_0_17666: # 0x5b
addi.n a14,a10,-1 # [0]
bnez a14,.Lt_0_33794 # [1]
addi.n a15,a12,-1 # [0]
bnez a15,.Lt_0_33794 # [1]
extui a8,a5,0,4 # [0]
bnez.n a8,.Lt_0_33794 # [1]
blti a6,4,.Lt_0_33794 # [0]
movi.n a9,64 # [0]
bge a9,a5,.Lt_0_34306 # [1]
extui a10,a5,0,1 # [0]
bnez a10,.LBB28_dspi_dotprod_u8_aes3 # [1]
// Common set-up: a3 = count_y, a2 = in_image->data, a15 = filter->data,
// accumulator cleared, first filter block preloaded into q0,
// gra_spill_temp_0 = in_image stride_x * step_y (row advance in bytes).
.Lt_0_34306: # 0x78
.Lt_0_19714: # 0x78
mov.n a3,a6 # [0]
addi a13,a5,-48 # [1]
movi.n a14,0 # [2]
mull a15,a11,a12 # [3]
l32i.n a2,a2,0 # [4] id:677
s32i.n a15,a1,0 # [6] gra_spill_temp_0
wur.accx_0 a14 # [7]
l32i.n a15,a1,4 # [8] gra_spill_temp_1
wur.accx_1 a14 # [9]
ee.vld.128.ip q0,a15,16 # [10] id:680
// Dispatch on count_x: 48, 32, 16, 64, 128, then the general loop.
beqz a13,.LBB32_dspi_dotprod_u8_aes3 # [11]
.Lt_0_22786: # 0x93
.Lt_0_22274: # 0x93
addi a8,a5,-32 # [0]
beqz a8,.LBB38_dspi_dotprod_u8_aes3 # [1]
.Lt_0_24322: # 0x99
.Lt_0_23810: # 0x99
addi a9,a5,-16 # [0]
beqz a9,.LBB44_dspi_dotprod_u8_aes3 # [1]
.Lt_0_25858: # 0x9f
.Lt_0_25346: # 0x9f
addi a10,a5,-64 # [0]
beqz a10,.LBB50_dspi_dotprod_u8_aes3 # [1]
.Lt_0_27394: # 0xa5
.Lt_0_26882: # 0xa5
addi a11,a5,-128 # [0]
beqz a11,.LBB56_dspi_dotprod_u8_aes3 # [1]
movi a12,128 # [0]
bge a12,a5,.Lt_0_30210 # [1]
// General case, count_x > 128: a12 counts rows, a13 = count_x / 32 - 1 inner
// trips of four MACs, followed by a four-MAC tail that also steps a2 to the
// next image row (a14 = row advance - count_x + 16).
// NOTE(review): this issues count_x / 8 MACs per row. The fixed-size cases
// in this routine use one MAC per 16 elements (e.g. four MACs for
// count_x == 64), so the trip count here looks twice as large as needed
// for u8 data - TODO confirm against the intrinsic source this was built from.
movi.n a12,0 # [0]
ee.ld.128.usar.ip q1,a2,16 # [1] id:752
ee.ld.128.usar.ip q2,a2,16 # [2] id:753
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:754
beqz.n a3,.Lt_0_30210 # [5]
l32i.n a14,a1,0 # [0] gra_spill_temp_0
addi a13,a5,31 # [1]
movgez a13,a5,a5 # [2]
srai a13,a13,5 # [3]
sub a14,a14,a5 # [4]
addi a14,a14,16 # [5]
addi.n a13,a13,-1 # [6]
.Lt_0_30978: # 0xd1
addi.n a12,a12,1 # [0]
movi.n a8,32 # [1]
movi.n a9,-16 # [2]
beqz.n a13,.Lt_0_31234 # [3]
loopnez a13,.LBB218_dspi_dotprod_u8_aes3 # [0]
.LBB216_dspi_dotprod_u8_aes3: # 0xdc
ee.vld.128.ip q5,a15,16 # [0*II+0] id:756
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:755
ee.vld.128.ip q0,a15,16 # [0*II+2] id:758
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:757
ee.vld.128.ip q5,a15,16 # [0*II+4] id:760
ee.vmulas.u8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:759
ee.vld.128.ip q0,a15,16 # [0*II+6] id:762
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:761
.LBB218_dspi_dotprod_u8_aes3: # 0xf8
.Lt_0_31234: # 0xf8
ee.vmulas.u8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:763
ee.vld.128.ip q0,a15,16 # [1] id:764
ee.vld.128.ip q6,a15,16 # [2] id:766
ee.vmulas.u8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:765
ee.vld.128.ip q4,a15,16 # [4] id:769
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:767
ee.ld.128.usar.xp q1,a2,a8 # [6] id:768
ee.vld.128.ip q0,a15,16 # [7] id:771
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:770
bne a12,a3,.Lt_0_30978 # [9]
// Result: take the low accumulator word, add 1 << (shift - 1), shift right
// (logical) by shift and store the low byte. Only accx_0 is read here.
.Lt_0_30210: # 0x11a
.Lt_0_29954: # 0x11a
movi.n a2,0 # [0]
rur.accx_0 a10 # [1]
addi.n a12,a7,-1 # [2]
movi.n a11,1 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
srl a10,a10 # [8]
s8i a10,a4,0 # [9] id:773
retw.n # [10]
// Fallback 2: image steps, count_x granularity or count_y not suitable.
.Lt_0_33794: # 0x136
.Lt_0_18946: # 0x136
mov.n a10,a2 # [0]
mov.n a11,a3 # [1]
mov.n a12,a4 # [2]
mov.n a13,a5 # [3]
mov.n a14,a6 # [4]
mov.n a15,a7 # [5]
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
// count_x == 48: one loop trip per row, three MACs per trip.
.LBB32_dspi_dotprod_u8_aes3: # 0x149
ee.ld.128.usar.ip q1,a2,16 # [0] id:681
ee.ld.128.usar.ip q2,a2,16 # [1] id:682
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:683
beqz.n a6,.Lt_0_22786 # [4]
movi.n a10,32 # [0]
l32i.n a12,a1,0 # [1] gra_spill_temp_0
movi.n a11,-16 # [2]
addi a12,a12,-32 # [3]
loopgtz a6,.LBB104_dspi_dotprod_u8_aes3 # [4]
.LBB102_dspi_dotprod_u8_aes3: # 0x160
ee.vld.128.ip q4,a15,16 # [0*II+0] id:685
ee.vmulas.u8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:684
ee.vld.128.ip q5,a15,16 # [0*II+2] id:687
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:686
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:688
ee.vld.128.ip q0,a15,16 # [0*II+5] id:690
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:689
.LBB104_dspi_dotprod_u8_aes3: # 0x178
j .Lt_0_22786 # [0]
// count_x == 32: count_y / 2 loop trips, four MACs (two rows) per trip.
.LBB38_dspi_dotprod_u8_aes3: # 0x17b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
srli a3,a6,1 # [2]
l32i.n a12,a1,0 # [3] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [4] id:691
ee.ld.128.usar.ip q2,a2,16 # [5] id:692
addi a12,a12,-16 # [7]
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:693
loopnez a3,.LBB127_dspi_dotprod_u8_aes3 # [9]
.LBB125_dspi_dotprod_u8_aes3: # 0x193
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
ee.vmulas.u8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:694
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:696
ee.vld.128.ip q0,a15,16 # [0*II+3] id:698
ee.vmulas.u8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:697
ee.vld.128.ip q5,a15,16 # [0*II+5] id:700
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:699
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:701
ee.vld.128.ip q0,a15,16 # [0*II+8] id:703
ee.vmulas.u8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:702
.LBB127_dspi_dotprod_u8_aes3: # 0x1b5
j .Lt_0_24322 # [0]
// count_x == 16: count_y / 4 loop trips, four MACs (four rows) per trip.
// The image pointer is walked in a8 and copied back to a2 afterwards.
.LBB44_dspi_dotprod_u8_aes3: # 0x1b8
srli a3,a3,2 # [0]
movi.n a10,-16 # [1]
l32i.n a11,a1,0 # [2] gra_spill_temp_0
addi a8,a2,16 # [3]
addi a11,a11,16 # [4]
ee.ld.128.usar.xp q2,a8,a10 # [5] id:704
ee.ld.128.usar.xp q1,a8,a11 # [6] id:705
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:706
ee.ld.128.usar.xp q2,a8,a11 # [9] id:707
loopnez a3,.LBB150_dspi_dotprod_u8_aes3 # [10]
.LBB148_dspi_dotprod_u8_aes3: # 0x1d4
ee.vld.128.ip q4,a15,16 # [0*II+0] id:709
ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:708
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:710
ee.vld.128.ip q0,a15,16 # [0*II+3] id:712
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:711
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:713
ee.vld.128.ip q5,a15,16 # [0*II+6] id:715
ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:714
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:716
ee.vld.128.ip q0,a15,16 # [0*II+9] id:718
ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:717
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:719
.LBB150_dspi_dotprod_u8_aes3: # 0x1fc
mov.n a2,a8 # [0]
j .Lt_0_25858 # [1]
// count_x == 64: one loop trip per row, four MACs per trip.
.LBB50_dspi_dotprod_u8_aes3: # 0x201
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:720
ee.ld.128.usar.ip q2,a2,16 # [4] id:721
sub a12,a12,a5 # [5]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:722
addi a12,a12,16 # [8]
loopnez a3,.LBB173_dspi_dotprod_u8_aes3 # [9]
.LBB171_dspi_dotprod_u8_aes3: # 0x219
ee.vld.128.ip q5,a15,16 # [0*II+0] id:724
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:723
ee.vld.128.ip q1,a15,16 # [0*II+2] id:726
ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:725
ee.vld.128.ip q5,a15,16 # [0*II+4] id:729
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:727
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:728
ee.vld.128.ip q0,a15,16 # [0*II+7] id:731
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:730
.LBB173_dspi_dotprod_u8_aes3: # 0x238
j .Lt_0_27394 # [0]
// count_x == 128: one loop trip per row, eight MACs per trip. This case has
// its own copy of the round / shift / store epilogue after the loop.
.LBB56_dspi_dotprod_u8_aes3: # 0x23b
movi.n a10,32 # [0]
movi.n a11,-16 # [1]
l32i.n a12,a1,0 # [2] gra_spill_temp_0
ee.ld.128.usar.ip q1,a2,16 # [3] id:732
ee.ld.128.usar.ip q2,a2,16 # [4] id:733
sub a12,a12,a5 # [6]
addi a12,a12,16 # [7]
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:734
loopnez a3,.LBB195_dspi_dotprod_u8_aes3 # [9]
.LBB193_dspi_dotprod_u8_aes3: # 0x253
ee.vld.128.ip q4,a15,16 # [0*II+0] id:736
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:735
ee.vld.128.ip q0,a15,16 # [0*II+2] id:738
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:737
ee.vld.128.ip q5,a15,16 # [0*II+4] id:740
ee.vmulas.u8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:739
ee.vld.128.ip q6,a15,16 # [0*II+6] id:742
ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:741
ee.vld.128.ip q5,a15,16 # [0*II+8] id:744
ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:743
ee.vld.128.ip q6,a15,16 # [0*II+10] id:746
ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:745
ee.vld.128.ip q5,a15,16 # [0*II+12] id:749
ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:747
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:748
ee.vld.128.ip q0,a15,16 # [0*II+15] id:751
ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:750
.LBB195_dspi_dotprod_u8_aes3: # 0x28e
movi.n a2,0 # [0]
movi.n a11,1 # [1]
addi.n a12,a7,-1 # [2]
rur.accx_0 a10 # [3]
ssl a12 # [4]
sll a11,a11 # [5]
ssr a7 # [6]
add.n a10,a10,a11 # [7]
srl a10,a10 # [8]
s8i a10,a4,0 # [9] id:773
retw.n # [10]
// Fallback 3: count_x > 64 and odd.
.LBB28_dspi_dotprod_u8_aes3: # 0x2aa
mov.n a15,a7 # [0]
mov.n a14,a6 # [1]
mov.n a13,a5 # [2]
mov.n a12,a4 # [3]
mov.n a11,a3 # [4]
mov.n a10,a2 # [5]
call8 dspi_dotprod_u8_ansi # [6] dspi_dotprod_u8_ansi
mov.n a2,a10 # [0]
retw.n # [1]
#endif // dspi_dotprod_aes3_enabled

View File

@@ -0,0 +1,49 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
/**
 * Reference (portable C) 2D dot product for unsigned 8-bit images.
 *
 * Multiplies a count_x by count_y window of in_image element-wise with the
 * same-sized window of filter, sums the products in a 32-bit accumulator,
 * rounds, shifts the sum right by shift and stores the low 8 bits in
 * *out_value.
 *
 * @param in_image  image descriptor; data, step_x, step_y, stride_x and stride_y are read
 * @param filter    filter descriptor, same fields as in_image
 * @param out_value destination for the 8-bit result
 * @param count_x   number of columns to process
 * @param count_y   number of rows to process
 * @param shift     right shift applied to the sum; when shift <= 0 the sum is
 *                  stored without rounding or shifting
 *
 * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when a step * count
 *         product exceeds the corresponding stride.
 */
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    // Row advance, counted in bytes (one byte per element).
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;
    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }
    if (shift > 0) {
        // Round to nearest; skipped for shift <= 0, where 1 << (shift - 1)
        // would be a shift by a negative count (undefined behaviour).
        acc += 1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}

View File

@@ -0,0 +1,93 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod_platform.h"
#if (dspi_dotprod_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dspi_dotprod_u8_arp4
.global dspi_dotprod_u8_ansi
.type dspi_dotprod_u8_arp4,@function
// esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
dspi_dotprod_u8_arp4:
// Unsigned 8-bit 2D dot product using the vector accumulator (XACC).
//
// in_image - a0
// filter - a1
// out_value - a2 (uint8_t *, exactly one byte is written)
// count_x - a3
// count_y - a4
// shift - a5
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 16 (128-bit blocks per row)
// a6 - shift - 1 (scratch)
//
// The vector path is used only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 16; everything else is handed to the ANSI
// implementation with the argument registers untouched.
    lw      t1, 4(a0)                   // load in_image->step_x
    lw      t2, 4(a1)                   // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1                  // should be 0 now
    andi    t2, a3, 15                  // count_x % 16
    or      t1, t1, t2
    beqz    t1, .dspi_dotprod_u8_arp4_body
    j       dspi_dotprod_u8_ansi        // tail call to the portable implementation
.dspi_dotprod_u8_arp4_body:
    add     sp, sp, -16                 // reserved frame (nothing is spilled into it)
    lw      t0, 0(a0)                   // i_data
    lw      t1, 0(a1)                   // f_data
    lw      t2, 8(a0)                   // step_y
    lw      t4, 12(a0)                  // stride_x
    mul     t2, t4, t2                  // i_step = stride_x * step_y
    lw      t3, 8(a1)                   // step_y
    lw      t5, 12(a1)                  // stride_x
    mul     t3, t5, t3                  // f_step = stride_x * step_y
    srli    t6, a3, 4                   // t6 = len/16
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6                  // rounding term 1 << (shift - 1)
    esp.zero.xacc
    esp.movx.w.xacc.l t4                // preload the accumulator with the rounding term
.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16           // q0 - i_data
    // Hardware loop 0: t6 passes over the two instructions ending at
    // .loop_count_x. Each MAC also fetches the next image block, so the last
    // pass of a row loads 16 bytes beyond the data it multiplies.
    esp.lp.setup 0, t6, .loop_count_x
    esp.vld.128.ip q1, t5, 16           // q1 - f_data
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data
    add     t0, t0, t2                  // next image row
    add     t1, t1, t3                  // next filter row
    add     a4, a4, -1
    bgtz    a4, .loop_count_y
    esp.srs.u.xacc t5, a5               // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    // out_value is a uint8_t *: store a single byte. A halfword store here
    // would also overwrite the byte that follows the destination.
    sb      t5, 0(a2)                   // store result to output buffer
    li      a0, 0                       // return 0 (presumably ESP_OK)
    add     sp, sp, 16
    ret
#endif // dspi_dotprod_arp4_enabled

View File

@@ -0,0 +1,80 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_s16_ae32_enabled == 1)
#include "dsps_dotprod_s16_m_ae32.S"
#include "dsp_err_codes.h"
.text
.align 4
.global dsps_dotprod_s16_ae32
.type dsps_dotprod_s16_ae32,@function
//esp_err_t dsps_dotprod_s16_ae32(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
dsps_dotprod_s16_ae32:
// Fixed-point dot product of two int16 vectors using the acchi:acclo
// accumulator; the multiply-accumulate loop itself comes from the
// dotprod_s16_ae32_full macro.
//
// Register map:
//   a2 - src1      a3 - src2      a4 - dest
//   a5 - len       a6 - shift (re-used for the final right-shift amount)
//   a7 - loop count passed to the macro (len / 4 - 1)
//   a8 - scratch   a10 - zero, set up for the macro
    entry   a1, 16

    // Vectors shorter than four samples are rejected.
    blti    a5, 4, .Ldsps_dotprod_s16_ae32_short

    // Accumulator = 0x7fff >> shift (rounding value), upper word cleared.
    movi    a8, 0
    wsr     a8, acchi
    movi    a8, 0x7fff
    ssr     a6
    srl     a8, a8
    wsr     a8, acclo

    // Loop count for the macro: len / 4 - 1.
    srli    a7, a5, 2
    addi    a7, a7, -1

    // Final right shift that brings the sum back to 16 bits: 15 - shift.
    neg     a6, a6
    addi    a6, a6, 15

    movi.n  a10, 0                      // load 0 to the a10 to increment second array
    dotprod_s16_ae32_full a2, a3, a7, a5

    // Funnel-shift acchi:acclo right by (15 - shift) and store the low 16 bits.
    ssr     a6
    rsr     a2, acchi
    rsr     a3, acclo
    src     a2, a2, a3
    s16i    a2, a4, 0

    movi.n  a2, 0                       // return 0 (presumably ESP_OK)
    retw.n

.Ldsps_dotprod_s16_ae32_short:
    movi.n  a2, ESP_ERR_DSP_INVALID_LENGTH
    retw.n
#endif // dsps_dotprod_s16_ae32_enabled

View File

@@ -0,0 +1,33 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift)
{
    /* Seed the accumulator with the rounding term, pre-scaled by `shift`,
     * so that the final scaling step rounds instead of truncating. */
    long long sum = 0x7fff >> shift;

    /* Accumulate the 32-bit products of corresponding samples. */
    int idx = 0;
    while (idx < len) {
        sum += (int32_t)src1[idx] * (int32_t)src2[idx];
        idx++;
    }

    /* Scale by 2^(shift - 15): left shift when positive, right shift otherwise.
     * The result is narrowed to int16_t on assignment. */
    const int scale = shift - 15;
    *dest = (scale > 0) ? (sum << scale) : (sum >> (-scale));

    return ESP_OK;
}

View File

@@ -0,0 +1,74 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_s16_arp4_enabled == 1)
#include "dsp_err_codes.h"
.text
.align 4
.global dsps_dotprod_s16_arp4
.global dsps_dotprod_s16_ansi
.type dsps_dotprod_s16_arp4,@function
//esp_err_t dsps_dotprod_s16_arp4(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
dsps_dotprod_s16_arp4:
// int16 dot product, processed 8 samples per iteration with the
// 128-bit vector unit and the XACC accumulator.
// Register arguments on entry:
// src1 - a0
// src2 - a1
// dest - a2
// len - a3
// shift - a4
// Return value in a0: 0 from the vector path, otherwise whatever
// dsps_dotprod_s16_ansi returns (tail call, arguments untouched).
//
// The vector path handles only a positive len that is a multiple of 8.
// len <= 0 must not enter it: the loop below always executes at least once
// and a negative len would turn into a huge unsigned trip count.
    blez a3, .dsps_dotprod_s16_arp4_ansi
    andi a5, a3, 7
    beqz a5, .dsps_dotprod_s16_arp4_body
.dsps_dotprod_s16_arp4_ansi:
    j dsps_dotprod_s16_ansi
.dsps_dotprod_s16_arp4_body:
    add sp,sp,-16
    // Enable unaligned data access (sets bit 1 of the cfg register)
    esp.movx.r.cfg t6
    or t6, t6, 2
    esp.movx.w.cfg t6
    // t6 = 15 - shift: right shift applied to the accumulator at the end
    add t6, a4, -15
    neg t6, t6 // t6 - real_shift
    // Seed the accumulator with the rounding term 0x7fff >> shift
    li t3, 0x7fff
    srl t3, t3, a4
    esp.zero.xacc
    esp.movx.w.xacc.l t3
    mv t3, a0 // t3 walks src1
    mv t4, a1 // t4 walks src2
    esp.vld.128.ip q0, t3, 16 // q0 - first 8 samples of src1 (preload)
    srli t5, a3, 3 // t5 = len>>3, number of 8-sample iterations
// Hardware-loop variant, kept for reference (disabled):
// esp.lp.setup 0, t5, .main_loop
// esp.vld.128.ip q1, t4, 16 // q1 - src2
// .main_loop: esp.vmulas.s16.xacc.ld.ip q0, t3, 16, q0, q1 // q0 - src1
.main_loop:
    esp.vld.128.ip q1, t4, 16 // q1 - next 8 samples of src2
    // xacc += q0 * q1, then reload q0 with the next 8 samples of src1.
    // The reload also happens on the last iteration, so 16 bytes past the
    // end of src1 are read (and never used).
    esp.vmulas.s16.xacc.ld.ip q0, t3, 16, q0, q1
    add t5, t5, -1
    bgtz t5, .main_loop
    esp.srs.s.xacc t5, t6 // shift xacc right by real_shift (t6), result to t5
    sh t5, 0(a2) // store the low 16 bits to the output buffer
    li a0,0 // return status: success
    add sp,sp,16
    ret
#endif // dsps_dotprod_s16_arp4_enabled

View File

@@ -0,0 +1,104 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.macro dotprod_s16_ae32 x1, x2, count
// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
// x1 - input array1 register (for example a2)
// x2 - input array2 register (for example a3)
// count - counter register (for example a7)
// count - samples_count / 4 - 1
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
// acchi and acclo have to be initialized before the macro is used
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3, and the pointer registers passed as x1 and x2
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
// Note: the labels .loop and .loop_end are fixed names, so the macro can be
// expanded only once per assembly unit.
/*
* Data schedule. Each line represents instruction, columns represent
* register contents. Last column (MUL) shows the multiplication which
* takes place. Values loaded in the given cycle are shown in square brackets.
*
* m0 m1 m2 m3 MUL
* --------- pre-load ------------
*[x0 x1] (no MULs in the first 3 instructions)
* x0 x1 [y0 y1]
* x0 x1 [x2 x3] y0 y1
* x0 x1 x2 x3 y0 y1 [y2 y3] x0*y0
* ---------- loop -------------- (the following 4 instructions are
*[x4 x5] x2 x3 y0 y1 y2 y3 x1*y1 repeated as much as needed)
* x4 x5 x2 x3 [y4 y5] y2 y3 x2*y2
* x4 x5 [x6 x7] y4 y5 y2 y3 x3*y3
* x4 x5 x6 x7 y4 y5 [y6 y7] x4*y4
* --------- finalize ------------
* x4 x5 x6 x7 y4 y5 y6 y7 x5*y5 (nothing is loaded)
* x4 x5 x6 x7 y4 y5 y6 y7 x6*y6
* x4 x5 x6 x7 y4 y5 y6 y7 x7*y7
*/
// Step both pointers back by one 32-bit word; presumably this compensates
// for ldinc advancing the address before it loads - confirm against the ISA manual.
addi \x1, \x1, -4 // To arrange first pointer
addi \x2, \x2, -4 // To arrange first pointer
//lddec m0, \x1
//lddec m2, \x2 // To arrange first pointer
// Pre-load: fill m0..m3 and issue the first multiply
ldinc m0, \x1
ldinc m2, \x2
ldinc m1, \x1
mula.dd.ll.ldinc m3, \x2, m0, m2
// Main loop: 4 multiply-accumulates and 4 loads per pass, skipped when count == 0
loopnez \count, .loop_end
.loop:
mula.dd.hh.ldinc m0, \x1, m0, m2
mula.dd.ll.ldinc m2, \x2, m1, m3
mula.dd.hh.ldinc m1, \x1, m1, m3
mula.dd.ll.ldinc m3, \x2, m0, m2
.loop_end:
// Finalize: the three products still pending in m0..m3, no further loads
mula.dd.hh m0, m2
mula.dd.ll m1, m3
mula.dd.hh m1, m3
.endm // dotprod_s16_ae32
.macro dotprod_s16_ae32_full x1, x2, count, full_count
// This macro calculates fixed point dot product for full_count int16 samples:
// ((count + 1)*4) samples through dotprod_s16_ae32, then the remaining
// 0..3 samples selected by the two low bits of full_count.
// x1 - input array1 register (for example a2)
// x2 - input array2 register (for example a3)
// count - counter register (for example a7)
// count - samples_count / 4 - 1
// full_count - samples_count
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
// acchi and acclo have to be initialized before the macro is used
// Result - acchi || acclo
// Modifies:
// m0, m1, m2, m3, and the pointer registers passed as x1 and x2
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
// Note: the labels .mod2chk and .mod1chk (and those of the inner macro) are
// fixed names, so the macro can be expanded only once per assembly unit.
dotprod_s16_ae32 \x1, \x2, \count
// Bit 1 of full_count set: two more samples, one 32-bit word from each array
bbci \full_count, 1, .mod2chk
ldinc m0, \x1
ldinc m2, \x2
mula.dd.hh m0, m2
mula.dd.ll m0, m2
.mod2chk:
// Bit 0 of full_count set: one last sample. A whole 32-bit word is loaded
// from each array but only the low halves are multiplied.
bbci \full_count, 0, .mod1chk
ldinc m0, \x1
ldinc m2, \x2
mula.dd.ll m0, m2
.mod1chk:
.endm // dotprod_s16_ae32_full

View File

@@ -0,0 +1,47 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y)
{
    /* The sampled window (step * count) must not exceed the stride of either
     * descriptor on either axis. */
    if ((in_image->step_x * count_x > in_image->stride_x) ||
            (in_image->step_y * count_y > in_image->stride_y) ||
            (filter->step_x * count_x > filter->stride_x) ||
            (filter->step_y * count_y > filter->stride_y)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    /* Distance, in float elements, between two consecutive sampled rows. */
    const int image_row_step = in_image->stride_x * in_image->step_y;
    const int filter_row_step = filter->stride_x * filter->step_y;

    float *image_row = (float *)in_image->data;
    float *filter_row = (float *)filter->data;
    float sum = 0;

    /* Walk the window row by row; the accumulation order is row-major. */
    for (int row = 0; row < count_y; row++) {
        for (int col = 0; col < count_x; col++) {
            sum += image_row[in_image->step_x * col] * filter_row[filter->step_x * col];
        }
        image_row += image_row_step;
        filter_row += filter_row_step;
    }

    *out_value = sum;
    return ESP_OK;
}

View File

@@ -0,0 +1,47 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspi_dotprod.h"
esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset)
{
    /* The sampled window (step * count) must not exceed the stride of either
     * descriptor on either axis. */
    if ((in_image->step_x * count_x > in_image->stride_x) ||
            (in_image->step_y * count_y > in_image->stride_y) ||
            (filter->step_x * count_x > filter->stride_x) ||
            (filter->step_y * count_y > filter->stride_y)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    /* Distance, in float elements, between two consecutive sampled rows. */
    const int image_row_step = in_image->stride_x * in_image->step_y;
    const int filter_row_step = filter->stride_x * filter->step_y;

    float *image_row = (float *)in_image->data;
    float *filter_row = (float *)filter->data;
    float sum = 0;

    /* Walk the window row by row. The offset is added to the filter sample
     * before it is multiplied with the image sample. */
    for (int row = 0; row < count_y; row++) {
        for (int col = 0; col < count_x; col++) {
            sum += image_row[in_image->step_x * col] * (filter_row[filter->step_x * col] + offset);
        }
        image_row += image_row_step;
        filter_row += filter_row_step;
    }

    *out_value = sum;
    return ESP_OK;
}

View File

@@ -0,0 +1,62 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dotprod_f32_ae32_enabled == 1)
#include "dsps_dotprod_f32_m_ae32.S"
// This is dot product function for ESP32 processor.
.text
.align 4
.global dsps_dotprod_f32_ae32
.global .dsps_dotprod_f32_ae32_body
.type dsps_dotprod_f32_ae32,@function
// The function implements the following C code:
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
//{
// float acc = 0;
// for (int i=0 ; i< len ; i++)
// {
// acc += src1[i]*src2[i];
// }
// *dest = acc;
// return ESP_OK;
//}
dsps_dotprod_f32_ae32:
// Single-precision dot product; the sum is accumulated in f1 and stored to *dest.
// Register arguments on entry:
// src1 - a2
// src2 - a3
// dest - a4
// len - a5
// Return value in a2: 0.
entry a1, 16
// Global label placed right after the entry instruction; declared .global
// above so that code outside this function can jump straight into the body.
.dsps_dotprod_f32_ae32_body:
// Array increment for floating point data should be 4
movi.n a8, 4
// Clear initial state of the result register
movi.n a9, 0
wfr f1, a9
// a2 - input1
// a3 - input2
// a5 - length
// a9 - 0, initial byte offset into both arrays (advanced by the macro)
// a8 - 4, step in arrays
dotprod_f32_ae32 a2, a3, a5, a9, a8;
ssi f1, a4, 0 // Store result from f1 to memory at a4
movi.n a2, 0 // return status ESP_OK
retw.n
#endif // dotprod_f32_ae32_enabled

View File

@@ -0,0 +1,85 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_f32_aes3_enabled == 1)
// This is dot product function for ESP32 processor.
.text
.align 4
.global dsps_dotprod_f32_aes3
.global .dsps_dotprod_f32_ae32_body
.type dsps_dotprod_f32_aes3,@function
// The function implements the following C code:
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
//{
// float acc = 0;
// for (int i=0 ; i< len ; i++)
// {
// acc += src1[i]*src2[i];
// }
// *dest = acc;
// return ESP_OK;
//}
dsps_dotprod_f32_aes3:
// Single-precision dot product that consumes 4 floats from each array per
// loop pass, using four independent partial sums (f0..f3).
// Register arguments on entry:
// src1 - a2
// src2 - a3
// dest - a4
// len - a5
// Return value in a2: 0 on the fast path.
entry a1, 16
// Check length and align: the fast path needs len to be a multiple of 4
// and both src1 and src2 to be 16-byte aligned
movi.n a10, 3
and a10, a10, a5 // a10 = len & 3
movi.n a9, 15
or a11, a3, a2
and a11, a9, a11 // a11 = (src1 | src2) & 15
or a10, a10, a11
beqz a10, .dsps_dotprod_f32_aes3_body
// Otherwise continue in the generic ESP32 implementation, entering it
// after its own entry instruction
J .dsps_dotprod_f32_ae32_body
.dsps_dotprod_f32_aes3_body:
// Clear initial state of the four partial-sum registers
movi.n a9, 0
wfr f0, a9
wfr f1, a9
wfr f2, a9
wfr f3, a9
// a2 - input1
// a3 - input2
// a5 - length
srli a6, a5, 2 // N count: number of 4-float groups
// lsx f0, a2, a9
// Loop body is skipped when a6 == 0
loopnez a6, .loop_mac_end_m_ae32
EE.LDF.128.IP f11, f10, f9, f8, a2, 16 // 4 floats from src1, pointer += 16
EE.LDF.128.IP f7, f6, f5, f4, a3, 16 // 4 floats from src2, pointer += 16
madd.s f0, f4, f8 // f0 += f4 * f8
madd.s f1, f5, f9 // f1 += f5 * f9
madd.s f2, f6, f10 // f2 += f6 * f10
madd.s f3, f7, f11 // f3 += f7 * f11
.loop_mac_end_m_ae32:
// Reduce the four partial sums into f0. The summation order differs from
// a strictly sequential loop, so rounding may differ slightly.
add.s f0, f0, f1
add.s f0, f0, f2
add.s f0, f0, f3
ssi f0, a4, 0 // Store result from f0 to memory at a4
movi.n a2, 0 // return status ESP_OK
retw.n
#endif // dsps_dotprod_f32_aes3_enabled

View File

@@ -0,0 +1,25 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len)
{
    /* Sequential single-precision accumulation of src1[k] * src2[k]. */
    float sum = 0;
    int idx = 0;
    while (idx < len) {
        sum += src1[idx] * src2[idx];
        idx++;
    }

    /* A non-positive len leaves the sum at 0. */
    *dest = sum;
    return ESP_OK;
}

View File

@@ -0,0 +1,77 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_f32_arp4_enabled == 1)
.text
.align 4
.global dsps_dotprod_f32_arp4
.type dsps_dotprod_f32_arp4,@function
// The function implements the following C code:
//esp_err_t dsps_dotprod_f32(const float* src1, const float* src2, float* dest, int len)
//{
// float acc = 0;
// for (int i=0 ; i< len ; i++)
// {
// acc += src1[i]*src2[i];
// }
// *dest = acc;
// return ESP_OK;
//}
dsps_dotprod_f32_arp4:
// Single-precision dot product; the sum is accumulated in fa2.
// Register arguments on entry:
// src1 - a0
// src2 - a1
// dest - a2
// len - a3
// Return value in a0: 0.
// Both loops are software-pipelined: the operands for the next multiply are
// loaded while the current one is accumulated, so one element past the last
// sample of each array is read (and never used).
    add sp,sp,-16
    fmv.w.x fa2,zero // fa2 = 0.0, the accumulator
    // len <= 0: nothing to accumulate. Store 0.0 and return without touching
    // the inputs; otherwise the count-down loop below would run past zero.
    blez a3, .dsps_dotprod_f32_arp4_store
    // Preload the first operand pair
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, 4
    add a1, a1, 4
    li a4, 2
    ble a3, a4, .loop_less_2
    // Loop when len > 2: hardware loop, body runs a3 times up to and
    // including the instruction at .dotprod_loop
    esp.lp.setup 0, a3, .dotprod_loop
    fmadd.s fa2, fa0, fa1, fa2 // fa2 += fa0 * fa1
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, 4
.dotprod_loop: add a1, a1, 4
    fsw fa2, 0(a2) // *dest = sum
    add sp,sp,16
    li a0,0 // return status: success
    ret
    // Loop when len <=2: plain count-down loop on a3
.loop_less_2:
    fmadd.s fa2, fa0, fa1, fa2 // fa2 += fa0 * fa1
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, 4
    add a1, a1, 4
    add a3, a3, -1
    bnez a3, .loop_less_2
.dsps_dotprod_f32_arp4_store:
    fsw fa2, 0(a2) // *dest = sum
    add sp,sp,16
    li a0,0 // return status: success
    ret
#endif // dsps_dotprod_f32_arp4_enabled

View File

@@ -0,0 +1,42 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.macro dotprod_f32_ae32 x1 x2 count step1 step2
// This macro calculates floating point dot product for count float samples
// x1, x2 - input arrays (base address registers, not modified)
// count - amount of samples
// step1 - A register holding the running byte offset into both arrays;
// its value on entry is the offset of the first sample, and it is
// advanced by step2 on every iteration
// step2 - A register with the byte increment between samples (4 for packed floats)
// f1 - contains initial value
//
// result in f1
//
// Macros body:
// f1 += x1[off]*x2[off]; off = step1 + i*step2 (bytes), i: 0..count-1
// affected: f0, f1, f2, and the register passed as step1
// Example: dotprod_f32_ae32 a2 a3 a5 a9 a8
// a9 == 0, offset of the first sample
// a8 == 4, step is 4 bytes
// a5 == 32, length of array is 32
// Note: x2 is read one sample ahead, so the load before the loop runs even
// when count == 0, and the last iteration reads x2 at the offset just past
// the final sample. The label .loop_mac_end_m_ae32 is a fixed name, so the
// macro can be expanded only once per assembly unit.
//
// mov \step1, \step2
lsx f0, \x2, \step1 // preload the first x2 sample
// sub \x1, \x1, \step1 // To compensate first increment
loopnez \count, .loop_mac_end_m_ae32
lsx f2, \x1, \step1 // f2 = x1 sample at the current offset
madd.s f1, f2, f0 // f1 += f2 * f0
add.n \step1, \step1, \step2 // advance the shared offset
lsx f0, \x2, \step1 // preload the next x2 sample
.loop_mac_end_m_ae32:
.endm

View File

@@ -0,0 +1,64 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dotprode_f32_ae32_enabled == 1)
#include "dsps_dotprode_f32_m_ae32.S"
// This is dot product function for ESP32 processor.
.text
.align 4
.global dsps_dotprode_f32_ae32
.type dsps_dotprode_f32_ae32,@function
// The function implements the following C code:
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
//{
// float acc = 0;
// for (int i=0 ; i< len ; i++)
// {
// acc += src1[i]*src2[i];
// }
// *dest = acc;
// return ESP_OK;
//}
dsps_dotprode_f32_ae32:
// Single-precision dot product with an independent element step for each
// array; the sum is accumulated in f1 and stored to *dest.
// Register arguments on entry:
// src1 - a2
// src2 - a3
// dest - a4
// len - a5
// step1- a6
// step2- a7
// Return value in a2: 0.
entry a1, 16
// Convert the element steps to byte steps (one float is 4 bytes)
slli a6,a6, 2
slli a7,a7, 2
// Clear initial state of the result register
movi.n a9, 0
wfr f1, a9
// a2 - input1
// a3 - input2
// a5 - length
// a6,a7 - byte steps for input1 and input2
dotprode_f32_ae32 a2, a3, a5, a6, a7;
ssi f1, a4, 0 // Store result from f1 to memory at a4
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //dotprode_f32_ae32_enabled

View File

@@ -0,0 +1,25 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
{
    /* Sequential single-precision accumulation of strided samples:
     * src1 advances by step1 elements and src2 by step2 elements per term. */
    float sum = 0;
    int idx = 0;
    while (idx < len) {
        sum += src1[idx * step1] * src2[idx * step2];
        idx++;
    }

    /* A non-positive len leaves the sum at 0. */
    *dest = sum;
    return ESP_OK;
}

View File

@@ -0,0 +1,78 @@
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod_platform.h"
#if (dsps_dotprod_f32_arp4_enabled == 1)
.text
.align 4
.global dsps_dotprode_f32_arp4
.type dsps_dotprode_f32_arp4,@function
// The function implements the following C code:
//esp_err_t dsps_dotprode_f32(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
//{
// float acc = 0;
// for (int i = 0 ; i < len ; i++) {
// acc += src1[i * step1] * src2[i * step2];
// }
// *dest = acc;
// return ESP_OK;
//}
dsps_dotprode_f32_arp4:
// Single-precision dot product with an independent element step for each
// array; the sum is accumulated in fa2.
// Register arguments on entry:
// src1 - a0
// src2 - a1
// dest - a2
// len - a3
// step1 - a4 (in elements, converted to bytes below)
// step2 - a5 (in elements, converted to bytes below)
// Return value in a0: 0.
// Both loops are software-pipelined: the operands for the next multiply are
// loaded while the current one is accumulated, so one strided element past
// the last sample of each array is read (and never used).
    add sp,sp,-16
    fmv.w.x fa2,zero // fa2 = 0.0, the accumulator
    // len <= 0: nothing to accumulate. Store 0.0 and return without touching
    // the inputs; otherwise the count-down loop below would run past zero.
    blez a3, .dsps_dotprode_f32_arp4_store
    slli a4, a4, 2 // step1 in bytes (one float is 4 bytes)
    slli a5, a5, 2 // step2 in bytes (one float is 4 bytes)
    // Preload the first operand pair
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, a4
    add a1, a1, a5
    li a6, 2
    ble a3, a6, .loop_less_2
    // Loop when len > 2: hardware loop, body runs a3 times up to and
    // including the instruction at .dotprod_loop
    esp.lp.setup 0, a3, .dotprod_loop
    fmadd.s fa2, fa0, fa1, fa2 // fa2 += fa0 * fa1
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, a4
.dotprod_loop: add a1, a1, a5
    fsw fa2, 0(a2) // *dest = sum
    add sp,sp,16
    li a0,0 // return status: success
    ret
    // Loop when len <=2: plain count-down loop on a3
.loop_less_2:
    fmadd.s fa2, fa0, fa1, fa2 // fa2 += fa0 * fa1
    flw fa0, 0(a0)
    flw fa1, 0(a1)
    add a0, a0, a4
    add a1, a1, a5
    add a3, a3, -1
    bnez a3, .loop_less_2
.dsps_dotprode_f32_arp4_store:
    fsw fa2, 0(a2) // *dest = sum
    add sp,sp,16
    li a0,0 // return status: success
    ret
#endif // dsps_dotprod_f32_arp4_enabled

View File

@@ -0,0 +1,41 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.macro dotprode_f32_ae32 x1 x2 count step1 step2
// This macro calculates floating point dot product for count float samples,
// with a separate stride for each array
// x1, x2 - input array pointer registers (both are advanced by the macro)
// count - amount of samples
// step1,step2 - A registers with the strides of x1 and x2; they are added
// directly to the pointers, so they must be given in bytes
// f1 - contains initial value
//
// result in f1
//
// Macros body:
// f1 += x1[i*step1]*x2[i*step2]; i: 0..count-1 (steps in bytes)
// affected: f0, f1, f2, and the registers passed as x1 and x2
// Example: dotprode_f32_ae32 a2 a3 a5 a6 a7
// a6 == 4, a7 == 4: both arrays are packed floats
// a5 == 32, length of array is 32
// Note: x2 is read one sample ahead, so the load before the loop runs even
// when count == 0, and the last iteration reads one stride past the final
// x2 sample. The label .loop_mace_end_m_ae32 is a fixed name, so the macro
// can be expanded only once per assembly unit.
//
lsi f0, \x2, 0 // preload the first x2 sample
sub \x1, \x1, \step1 // To compensate first increment
loopnez \count, .loop_mace_end_m_ae32
add.n \x1, \x1, \step1 // advance x1 to the current sample
lsi f2, \x1, 0
madd.s f1, f2, f0 // f1 += f2 * f0
add.n \x2, \x2, \step2 // advance x2 to the next sample
lsi f0, \x2, 0 // preload the next x2 sample
.loop_mace_end_m_ae32:
.endm

View File

@@ -0,0 +1,191 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _dspi_dotprod_H_
#define _dspi_dotprod_H_
#include "esp_log.h"
#include "dsp_err.h"
#include "dsp_types.h"
#include "dspi_dotprod_platform.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**@{*/
/**
* @brief dot product of two images
* Dot product calculation for two floating point images: *out_value += image[i*...] * filter[i*...]; i = [0..count_x*count_y)
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_ae32) is optimized for ESP32 chip.
*
* @param[in] in_image descriptor of the image
* @param[in] filter descriptor of the filter
* @param[out] out_value pointer to the output value
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y);
/**@}*/
/**@{*/
/**
* @brief dot product of two images
* Dot product calculation for two integer images: *out_value += image[i*...] * filter[i*...]; i = [0..count_x*count_y)
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_aes3) is optimized for the ESP32-S3 chip, the extension (_arp4) for the ESP32-P4 chip.
*
* @param[in] in_image descriptor of the image
* @param[in] filter descriptor of the filter
* @param[out] out_value pointer to the output value
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
* @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
/**@}*/
/**@{*/
/**
* @brief dot product of two images with input offset
* Dot product calculation for two floating point images: *out_value += image[i*...] * (filter[i*...] + offset); i = [0..count_x*count_y)
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_ae32) is optimized for ESP32 chip.
*
* @param[in] in_image descriptor of the image
* @param[in] filter descriptor of the filter
* @param[out] out_value pointer to the output value
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
* @param[in] offset - input offset value.
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset);
/**@}*/
/**@{*/
/**
* @brief dot product of two images with input offset
* Dot product calculation for two integer images: *out_value += (image[i*...] + offset) * src2[i*...]); i= [0..count_x*count_y)
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_aes3) is optimized for the ESP32-S3 chip, the extension (_arp4) for the ESP32-P4 chip.
*
* @param[in] in_image descriptor of the image
* @param[in] filter descriptor of the filter
* @param[out] out_value pointer to the output value
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
* @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
* @param[in] offset - input offset value.
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
esp_err_t dspi_dotprod_off_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
esp_err_t dspi_dotprod_off_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
esp_err_t dspi_dotprod_off_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
esp_err_t dspi_dotprod_off_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
/**@}*/
#ifdef __cplusplus
}
#endif
// Bind each generic dspi_dotprod_* name to one concrete implementation.
// Optimized build: the integer variants map to the _aes3 or _arp4 functions
// when the matching *_enabled flag equals 1, and to the ANSI C functions
// otherwise. The f32 variants map to the ANSI C functions in every branch.
// An *_enabled flag that is not defined evaluates to 0 inside #if.
#ifdef CONFIG_DSP_OPTIMIZED
#define dspi_dotprod_f32 dspi_dotprod_f32_ansi
#define dspi_dotprod_off_f32 dspi_dotprod_off_f32_ansi
#if (dspi_dotprod_aes3_enabled == 1)
#define dspi_dotprod_s16 dspi_dotprod_s16_aes3
#define dspi_dotprod_u16 dspi_dotprod_u16_aes3
#define dspi_dotprod_s8 dspi_dotprod_s8_aes3
#define dspi_dotprod_u8 dspi_dotprod_u8_aes3
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_aes3
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_aes3
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_aes3
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_aes3
#elif (dspi_dotprod_arp4_enabled == 1)
#define dspi_dotprod_s16 dspi_dotprod_s16_arp4
#define dspi_dotprod_s8 dspi_dotprod_s8_arp4
#define dspi_dotprod_u16 dspi_dotprod_u16_arp4
#define dspi_dotprod_u8 dspi_dotprod_u8_arp4
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_arp4
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_arp4
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_arp4
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_arp4
#else
#define dspi_dotprod_s16 dspi_dotprod_s16_ansi
#define dspi_dotprod_s8 dspi_dotprod_s8_ansi
#define dspi_dotprod_u16 dspi_dotprod_u16_ansi
#define dspi_dotprod_u8 dspi_dotprod_u8_ansi
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_ansi
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_ansi
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_ansi
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_ansi
#endif // aes3 / arp4 / ansi selection
#endif // CONFIG_DSP_OPTIMIZED
// ANSI build: every generic name maps to its ANSI C implementation.
#ifdef CONFIG_DSP_ANSI
#define dspi_dotprod_f32 dspi_dotprod_f32_ansi
#define dspi_dotprod_off_f32 dspi_dotprod_off_f32_ansi
#define dspi_dotprod_s16 dspi_dotprod_s16_ansi
#define dspi_dotprod_s8 dspi_dotprod_s8_ansi
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_ansi
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_ansi
#define dspi_dotprod_u16 dspi_dotprod_u16_ansi
#define dspi_dotprod_u8 dspi_dotprod_u8_ansi
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_ansi
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_ansi
#endif // CONFIG_DSP_ANSI
#endif // _dspi_dotprod_H_

View File

@@ -0,0 +1,24 @@
// Platform feature flags for the dspi_dotprod_* image dot-product functions.
// The flags defined here are tested with "== 1" by the header that selects
// the implementation; a flag left undefined evaluates to 0 in that test.
#ifndef _dspi_dotprod_platform_H_
#define _dspi_dotprod_platform_H_
#include "sdkconfig.h"
#ifdef __XTENSA__
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>
// Xtensa build for the ESP32-S3 target: enable the _aes3 implementations.
#if CONFIG_IDF_TARGET_ESP32S3
#define dspi_dotprod_aes3_enabled 1
#endif
#endif // __XTENSA__
// ESP32-P4 target: the _arp4 flag is 1 only when CONFIG_DSP_OPTIMIZED is defined.
#if CONFIG_IDF_TARGET_ESP32P4
#ifdef CONFIG_DSP_OPTIMIZED
#define dspi_dotprod_arp4_enabled 1
#else
#define dspi_dotprod_arp4_enabled 0
#endif
#endif
#endif // _dspi_dotprod_platform_H_

View File

@@ -0,0 +1,128 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _DSPI_DOTPROD_H_
#define _DSPI_DOTPROD_H_
#include "esp_log.h"
#include "dsp_err.h"
#include "dsps_dotprod_platform.h"
#ifdef __cplusplus
extern "C"
{
#endif
// These functions calculate the dot product of two vectors.
/**@{*/
/**
* @brief dot product of two 16 bit vectors
* Dot product calculation for two signed 16 bit arrays.
* The unit tests for these functions expect
* *dest == (sum(src1[i] * src2[i]) + (0x7fff >> shift)) >> (15 - shift); i = [0..N)
* and call the function repeatedly without resetting *dest, i.e. the previous
* content of *dest is overwritten, not accumulated.
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_ae32) is optimized for ESP32 chip.
* The extension (_arp4) is selected for the ESP32-P4 target in optimized builds.
*
* @param[in] src1 source array 1
* @param[in] src2 source array 2
* @param dest destination pointer
* @param[in] len length of input arrays
* @param[in] shift shift of the result.
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
esp_err_t dsps_dotprod_s16_ae32(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
esp_err_t dsps_dotprod_s16_arp4(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
/**@}*/
/**@{*/
/**
* @brief dot product of two float vectors
* Dot product calculation for two floating point arrays:
* *dest = sum(src1[i] * src2[i]); i = [0..N)
* The unit tests call the function repeatedly without resetting *dest and
* expect the plain sum each time, i.e. *dest is overwritten.
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_ae32) is optimized for ESP32 chip.
* The extension (_aes3) is selected for the ESP32-S3 target and (_arp4) for
* the ESP32-P4 target in optimized builds.
*
* @param[in] src1 source array 1
* @param[in] src2 source array 2
* @param dest destination pointer
* @param[in] len length of input arrays
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len);
esp_err_t dsps_dotprod_f32_ae32(const float *src1, const float *src2, float *dest, int len);
esp_err_t dsps_dotprod_f32_aes3(const float *src1, const float *src2, float *dest, int len);
esp_err_t dsps_dotprod_f32_arp4(const float *src1, const float *src2, float *dest, int len);
/**@}*/
/**@{*/
/**
* @brief dot product of two float vectors with step
* Dot product calculation for two floating point arrays:
* *dest = sum(src1[i*step1] * src2[i*step2]); i = [0..N)
* The unit tests call the function repeatedly without resetting *dest and
* expect the plain sum each time, i.e. *dest is overwritten.
* The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
* The extension (_ae32) is optimized for ESP32 chip.
* The extension (_arp4) is selected for the ESP32-P4 target in optimized builds.
*
* @param[in] src1 source array 1
* @param[in] src2 source array 2
* @param dest destination pointer
* @param[in] len length of input arrays
* @param[in] step1 step over elements in first array
* @param[in] step2 step over elements in second array
* @return
* - ESP_OK on success
* - One of the error codes from DSP library
*/
esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
esp_err_t dsps_dotprode_f32_ae32(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
esp_err_t dsps_dotprode_f32_arp4(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
/**@}*/
#ifdef __cplusplus
}
#endif
// Bind the generic dsps_dotprod_* names to one concrete implementation.
// Optimized build: pick the first platform variant whose *_enabled flag is 1,
// falling back to ANSI C. Non-optimized build: always ANSI C.
#if CONFIG_DSP_OPTIMIZED
#if (dsps_dotprod_s16_ae32_enabled == 1)
#define dsps_dotprod_s16 dsps_dotprod_s16_ae32
#elif (dsps_dotprod_s16_arp4_enabled == 1)
#define dsps_dotprod_s16 dsps_dotprod_s16_arp4
#else
#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
#endif // dsps_dotprod_s16 selection
#if (dsps_dotprod_f32_aes3_enabled == 1)
#define dsps_dotprod_f32 dsps_dotprod_f32_aes3
// No _aes3 variant of dsps_dotprode_f32 is declared in this header, so the _ae32 one is used.
#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
#elif (dsps_dotprod_f32_arp4_enabled == 1)
#define dsps_dotprod_f32 dsps_dotprod_f32_arp4
#define dsps_dotprode_f32 dsps_dotprode_f32_arp4
#elif (dotprod_f32_ae32_enabled == 1)
#define dsps_dotprod_f32 dsps_dotprod_f32_ae32
#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
#else
#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
#define dsps_dotprode_f32 dsps_dotprode_f32_ansi
#endif // dsps_dotprod_f32 / dsps_dotprode_f32 selection
#else // CONFIG_DSP_OPTIMIZED
#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
#define dsps_dotprode_f32 dsps_dotprode_f32_ansi
#endif // CONFIG_DSP_OPTIMIZED
#endif // _DSPI_DOTPROD_H_

View File

@@ -0,0 +1,42 @@
// Platform feature flags for the dsps_dotprod_* vector dot-product functions.
// A flag that stays undefined evaluates to 0 when tested with "== 1" in #if.
#ifndef _dsps_dotprod_platform_H_
#define _dsps_dotprod_platform_H_
#include "sdkconfig.h"
#ifdef __XTENSA__
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>
// The float _ae32 variants are enabled when the core has both an FPU and zero-overhead loops.
#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
#define dotprod_f32_ae32_enabled 1
#define dotprode_f32_ae32_enabled 1
#endif // XCHAL_HAVE_FP && XCHAL_HAVE_LOOPS
// The 16-bit _ae32 variant is enabled when the core has both loops and the MAC16 option.
#if ((XCHAL_HAVE_LOOPS == 1) && (XCHAL_HAVE_MAC16 == 1))
#define dsps_dotprod_s16_ae32_enabled 1
#endif // XCHAL_HAVE_LOOPS && XCHAL_HAVE_MAC16
#endif // __XTENSA__
// ESP32-S3 target: enable the _aes3 flags.
// NOTE(review): dsps_dotprod.h as seen here only tests the f32 flag — confirm where the s16 flag is consumed.
#if CONFIG_IDF_TARGET_ESP32S3
#define dsps_dotprod_s16_aes3_enabled 1
#define dsps_dotprod_f32_aes3_enabled 1
#endif
// ESP32-P4 target: the _arp4 flags are 1 only when CONFIG_DSP_OPTIMIZED is defined.
#if CONFIG_IDF_TARGET_ESP32P4
#ifdef CONFIG_DSP_OPTIMIZED
#define dsps_dotprod_s16_arp4_enabled 1
#define dsps_dotprod_f32_arp4_enabled 1
#else
#define dsps_dotprod_s16_arp4_enabled 0
#define dsps_dotprod_f32_arp4_enabled 0
#endif // CONFIG_DSP_OPTIMIZED
#endif
#endif // _dsps_dotprod_platform_H_

View File

@@ -0,0 +1,167 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "esp_dsp.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dsps_dotprod.h"
#include "dsp_tests.h"
TEST_CASE("dsps_dotprod_f32_aexx functionality", "[dsps]")
{
    // Runs the platform-selected dsps_dotprod_f32 for every length 1..max_N-1.
    // The result is written to z[1]; z[0] and z[2] hold guard values that must
    // survive every call.
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so the dot product must be 0 for every length.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }
    // Every element contributes 1 * 3, so the expected result is i * 3.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprod_f32_aexx benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one 1024-sample call of the
    // platform-selected dsps_dotprod_f32 and checks it against fixed limits.
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    printf("Benchmark dsps_dotprod_f32_aexx - x=%8.8"PRIx32", y=%8.8"PRIx32", len=%8.8x\n", (uint32_t)x, (uint32_t)y, 1024);

    int repeat_count = 1024;
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_f32(x, y, &z[1], 1024);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Average cycles per call (each call processes 1024 samples).
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprod_f32_aexx - %f per 1024 samples + overhead.\n", cycles);

    float min_exec = 1024;
    float max_exec = 6 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprod_f32_ansi functionality", "[dsps]")
{
    // Runs dsps_dotprod_f32_ansi for every length 1..max_N-1. The result is
    // written to z[1]; z[0] and z[2] hold guard values that must survive.
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)malloc(max_N * sizeof(float));
    float *y = (float *)malloc(max_N * sizeof(float));
    float *z = (float *)malloc(max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so the dot product must be 0 for every length.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }
    // Every element contributes 1 * 3, so the expected result is i * 3.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprod_f32_ansi benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one 1024-sample call of
    // dsps_dotprod_f32_ansi and checks it against fixed limits.
    int max_N = 1024;
    float *x = (float *)malloc(max_N * sizeof(float));
    float *y = (float *)malloc(max_N * sizeof(float));
    float *z = (float *)malloc(max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    int repeat_count = 1024;
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_f32_ansi(x, y, &z[1], 1024);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // The total is divided by the number of calls only, so the figure (and the
    // limits below) are cycles per 1024-sample call, not per sample.
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprod_f32_ansi - %f per 1024 samples + overhead.\n", cycles);

    float min_exec = 1024;
    float max_exec = 20 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,216 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dsps_dotprod.h"
#include "dsp_tests.h"
// Test dsps_dotprod_s16_ansi function
TEST_CASE("dsps_dotprod_s16_ansi functionality", "[dsps]")
{
    // The result is written to z[1]; z[0] and z[2] hold guard values that must
    // survive every call. Lengths 4..max_N-1 are exercised.
    int16_t check_value = 1235;
    int max_N = 1024;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // Check result == 0
    for (int i = 4; i < max_N; i++) {
        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, 0);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    int16_t val_x = 0x080;
    int16_t val_y = 0x100;
    int16_t val_shift = 0;
    for (int i = 0; i < max_N; i++) {
        x[i] = val_x;
        y[i] = val_y;
    }

    // Check that the dot product works with shift = 0: the expected value is
    // the rounded sum shifted right by (15 - shift).
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    // Same check with shift = 2.
    val_shift = 2;
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(((long long)i * ((long long)val_x * (long long)val_y) + ((long long)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    free(x);
    free(y);
    free(z);
}
// Test the platform-selected dsps_dotprod_s16 function
TEST_CASE("dsps_dotprod_s16_aexx functionality", "[dsps]")
{
    // The result is written to z[1]; z[0] and z[2] hold guard values that must
    // survive every call. Lengths 4..max_N-1 are exercised.
    int16_t check_value = 1235;
    int max_N = 1024;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // Check result == 0
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, 0);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    int16_t val_x = 0x080;
    int16_t val_y = 0x100;
    int16_t val_shift = 0;
    for (int i = 0 ; i < max_N ; i++) {
        x[i] = val_x;
        y[i] = val_y;
    }

    // Check that the dot product works with shift = 0: the expected value is
    // the rounded sum shifted right by (15 - shift).
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    // Same check with shift = 2.
    val_shift = 2;
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL((i * (val_x * val_y) + ((int)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    free(x);
    free(y);
    free(z);
}
// Spinlock used by the s16 benchmarks to keep interrupts out of the timed section.
static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
TEST_CASE("dsps_dotprod_s16 benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one 1024-sample call of the
    // platform-selected dsps_dotprod_s16 and checks it against fixed limits.
    int max_N = 1024;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    // Check before entering the critical section so a failed allocation cannot
    // abort the test while the lock is held.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0x100;
        y[i] = 0x200;
    }

    int repeat_count = 1024;
    // Disable interrupts to get an exact cycle count
    portENTER_CRITICAL(&testnlock);
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_s16(x, y, &z[1], 1024, 0);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();
    portEXIT_CRITICAL(&testnlock);

    // Average cycles per call (each call processes 1024 samples).
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprod_s16 - %f cycles for 1024 samples + overhead. Result = %08x\n", cycles, z[1]);

    float min_exec = 256;
    float max_exec = 8 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprod_s16_ansi benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one 1024-sample call of
    // dsps_dotprod_s16_ansi and checks it against fixed limits.
    int max_N = 1024;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    // Check before entering the critical section so a failed allocation cannot
    // abort the test while the lock is held.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0x100;
        y[i] = 0x200;
    }

    int repeat_count = 1024;
    // Disable interrupts to get an exact cycle count
    portENTER_CRITICAL(&testnlock);
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_s16_ansi(x, y, &z[1], 1024, 0);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();
    portEXIT_CRITICAL(&testnlock);

    // Average cycles per call (each call processes 1024 samples).
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    // The message names the _ansi function so its output can be told apart
    // from the platform-selected benchmark.
    printf("Benchmark dsps_dotprod_s16_ansi - %f cycles for 1024 samples + overhead. Result = %08x\n", cycles, z[1]);

    float min_exec = 1024 * 10;
    float max_exec = 1024 * 30;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,165 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dsps_dotprod.h"
#include "dsp_tests.h"
TEST_CASE("dsps_dotprode_f32 functionality", "[dsps]")
{
    // Runs the platform-selected dsps_dotprode_f32 with both steps set to 1 for
    // every length 1..max_N-1. The result is written to z[1]; z[0] and z[2]
    // hold guard values that must survive every call.
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so the dot product must be 0 for every length.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }
    // Every element contributes 1 * 3, so the expected result is i * 3.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprode_f32 benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one 1024-sample call of the
    // platform-selected dsps_dotprode_f32 (steps 1, 1) against fixed limits.
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    int repeat_count = 1024;
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprode_f32(x, y, &z[1], 1024, 1, 1);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Average cycles per call (each call processes 1024 samples).
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprode_f32_aexx - %f per 1024 samples + overhead.\n", cycles);

    float min_exec = 1024;
    float max_exec = 6 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprode_f32_ansi functionality", "[dsps]")
{
    // Runs dsps_dotprode_f32_ansi with both steps set to 1 for every length
    // 1..max_N-1. The result is written to z[1]; z[0] and z[2] hold guard
    // values that must survive every call.
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so the dot product must be 0 for every length.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }
    // Every element contributes 1 * 3, so the expected result is i * 3.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
TEST_CASE("dsps_dotprode_f32_ansi benchmark", "[dsps]")
{
    // Measures the average CPU cycle count per sample of
    // dsps_dotprode_f32_ansi (steps 1, 1) and checks it against fixed limits.
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    int repeat_count = 1024;
    unsigned int start_b = dsp_get_cpu_cycle_count();
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprode_f32_ansi(x, y, &z[1], 1024, 1, 1);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Divide by calls * samples per call to get cycles per sample.
    float total_b = end_b - start_b;
    float cycles = total_b / (1024 * repeat_count);
    printf("Benchmark dsps_dotprode_f32_ansi - %f per sample + overhead.\n", cycles);

    float min_exec = 5;
    float max_exec = 25;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,67 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_f32_ansi";
TEST_CASE("dspi_dotprod_f32_ansi functionality", "[dspi]")
{
    // Runs dspi_dotprod_f32_ansi on a 4x4 window at four different start
    // positions of the same data and compares against precomputed values.
    float check_value1 = 336;
    float check_value2 = 480;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Each row of 8 values holds 1..8.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 1;
        y[i] = i % 8 + 1;
        z[i] = 0;
    }
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start at element 0.
    float result = -1;
    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
    ESP_LOGI(TAG, "result 1 = %f", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
    ESP_LOGI(TAG, "result 2 = %f", result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements further on.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
    ESP_LOGI(TAG, "result 3 = %f", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements further on.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
    ESP_LOGI(TAG, "result 4 = %f", result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_f32_ansi";
TEST_CASE("dspi_dotprod_off_f32_ansi functionality", "[dspi]")
{
    // Runs dspi_dotprod_off_f32_ansi with offset 10 on a 4x4 window at four
    // different start positions and compares against precomputed values.
    float check_value1 = 976;
    float check_value2 = 1280;
    float offset = 10;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Each row of 8 values holds 1..8.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 1;
        y[i] = i % 8 + 1;
        z[i] = 0;
    }
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start at element 0.
    float result = -1;
    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
    ESP_LOGI(TAG, "result 1 = %f", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
    ESP_LOGI(TAG, "result 2 = %f", result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements further on.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
    ESP_LOGI(TAG, "result 3 = %f", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements further on.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
    ESP_LOGI(TAG, "result 4 = %f", result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,107 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_s16";
TEST_CASE("dspi_dotprod_off_s16_aexx functionality", "[dspi]")
{
int shift = 2;
int16_t offset = 7;
int max_N = 8192;
int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
for (size_t i = 0; i < max_N; i++) {
x[i] = i % 7;
y[i] = i % 7;
z[i] = 0;
}
{
ESP_LOGI(TAG, "dspi_dotprod_off_s16 8x8");
image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64
image2d_t image2 = {y, 1, 1, 8, 8, 8, 8}; // Umage 64
int16_t result = -1;
unsigned int start_b = dsp_get_cpu_cycle_count();
dspi_dotprod_off_s16(&image1, &image2, &result, 8, 8, shift, offset);
unsigned int end_b = dsp_get_cpu_cycle_count();
ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
ESP_LOGI(TAG, "result 1 = %i", result);
int16_t result_ref = -1;
dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 8, 8, shift, offset);
ESP_LOGI(TAG, "result ref = %i", result_ref);
TEST_ASSERT_EQUAL( result, result_ref);
}
{
ESP_LOGI(TAG, "dspi_dotprod_off_s16 16x16");
image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Umage 16x16
int16_t result = -1;
unsigned int start_b = dsp_get_cpu_cycle_count();
dspi_dotprod_off_s16(&image1, &image2, &result, 16, 16, shift, offset);
unsigned int end_b = dsp_get_cpu_cycle_count();
ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
ESP_LOGI(TAG, "result 1 = %i", result);
int16_t result_ref = -1;
dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
ESP_LOGI(TAG, "result ref = %i", result_ref);
TEST_ASSERT_EQUAL( result, result_ref);
}
{
ESP_LOGI(TAG, "dspi_dotprod_off_s16 24x24");
image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
image2d_t image2 = {y, 1, 1, 24, 24, 24, 24}; // Umage 24x24
int16_t result = -1;
unsigned int start_b = dsp_get_cpu_cycle_count();
dspi_dotprod_off_s16(&image1, &image2, &result, 24, 24, shift, offset);
unsigned int end_b = dsp_get_cpu_cycle_count();
ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
ESP_LOGI(TAG, "result 1 = %i", result);
int16_t result_ref = -1;
dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 24, 24, shift, offset);
ESP_LOGI(TAG, "result ref = %i", result_ref);
TEST_ASSERT_EQUAL( result, result_ref);
}
{
ESP_LOGI(TAG, "dspi_dotprod_off_s16 32x32");
image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Umage 32x32
int16_t result = -1;
unsigned int start_b = dsp_get_cpu_cycle_count();
dspi_dotprod_off_s16(&image1, &image2, &result, 32, 32, shift, offset);
unsigned int end_b = dsp_get_cpu_cycle_count();
ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
ESP_LOGI(TAG, "result 1 = %i", result);
int16_t result_ref = -1;
dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 32, 32, shift, offset);
ESP_LOGI(TAG, "result ref = %i", result_ref);
TEST_ASSERT_EQUAL( result, result_ref);
}
ESP_LOGI(TAG, "dspi_dotprod_off_s16 done");
free(x);
free(y);
free(z);
}

View File

@@ -0,0 +1,69 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_s16_ansi";
TEST_CASE("dspi_dotprod_off_s16_ansi functionality", "[dspi]")
{
    // Runs dspi_dotprod_off_s16_ansi (shift 7, offset 11) on a 4x4 window at
    // four different start positions and compares against precomputed values.
    int16_t check_value1 = 8676;
    int16_t check_value2 = 8742;
    int shift = 7;
    int16_t offset = 11;
    int max_N = 1024;
    int16_t *x = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)malloc(max_N * sizeof(int16_t));
    // Fail cleanly instead of writing through a NULL pointer when the heap is exhausted.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Each row of 8 values holds 255..262.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
        z[i] = 0;
    }
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start at element 0.
    int16_t result = -1;
    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements further on.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements further on.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,123 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_s8";

// Compares dspi_dotprod_off_s8() with the reference dspi_dotprod_off_s8_ansi()
// for several window sizes and logs the cycle count of the former.
TEST_CASE("dspi_dotprod_off_s8_aexx functionality", "[dspi]")
{
    // One comparison run per entry; banners are the strings the test has always logged.
    static const struct {
        const char *label;  // banner logged before the run, also attached to a failure
        int image_dim;      // stride and size (both axes) of the first image
        int window_dim;     // stride/size of the second image and the count_x/count_y arguments
    } cases[] = {
        {"dspi_dotprod_off_s8 16x16",   64,  16},
        {"dspi_dotprod_off_s8 32x32",   64,  32},
        {"dspi_dotprod_off_s8 48x48",   64,  48},
        {"dspi_dotprod_off_s8 64x64",   128, 64},
        {"dspi_dotprod_off_s8 128x128", 128, 16},
    };
    int shift = 2;
    int8_t offset = 5;
    const size_t max_N = 16384;

    int8_t *x = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    int8_t *y = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    int8_t *z = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int img = cases[c].image_dim;
        const int win = cases[c].window_dim;

        ESP_LOGI(TAG, "%s", cases[c].label);
        // The first image starts 3 elements into x, the second at the 16-byte aligned start of y.
        // NOTE(review): a 128x128 image at &x[3] nominally extends past max_N; presumably only
        // the win x win window is read - confirm against the implementation.
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        image2d_t image2 = {y, 1, 1, win, win, win, win};

        int8_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_s8(&image1, &image2, &result, win, win, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        // The cycle delta is unsigned, so it is printed with %u.
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        int8_t result_ref = -1;
        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, win, win, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity takes (expected, actual); the label identifies the failing run.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_s8 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,70 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_s8_ansi";

// Runs dspi_dotprod_off_s8_ansi() on a 4x4 window at four different start
// positions inside the same data and compares every result with a hard-coded
// reference value.
TEST_CASE("dspi_dotprod_off_s8_ansi functionality", "[dspi]")
{
    // Hard-coded reference results for the two distinct data alignments.
    int8_t check_value1 = 98;
    int8_t check_value2 = 106;
    int shift = 7;
    int8_t offset = 11;
    int max_N = 1024;

    int8_t *x = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *y = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *z = (int8_t *)malloc(max_N * sizeof(int8_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Only the first 256 elements are initialised; the pattern repeats every 8 elements.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
        z[i] = 0;
    }

    // NOTE(review): initialiser order is presumably
    // {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against image2d_t.
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    int8_t result = -1;

    // Case 1: both windows start at element 0.
    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity takes (expected, actual); this order keeps failure messages truthful.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both windows shifted by one element.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both windows shifted by stride_x elements; expected to match case 1.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both windows shifted by stride_x + 1 elements; expected to match case 2.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,107 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_u16";

// Compares dspi_dotprod_off_u16() with the reference dspi_dotprod_off_u16_ansi()
// for several window sizes and logs the cycle count of the former.
TEST_CASE("dspi_dotprod_off_u16_aexx functionality", "[dspi]")
{
    // One comparison run per entry; banners are the strings the test has always logged.
    static const struct {
        const char *label;  // banner logged before the run, also attached to a failure
        int image_dim;      // stride and size (both axes) of the first image
        int window_dim;     // stride/size of the second image and the count_x/count_y arguments
    } cases[] = {
        {"dspi_dotprod_off_u16 8x8",   64, 8},
        {"dspi_dotprod_off_u16 16x16", 64, 16},
        {"dspi_dotprod_off_u16 24x24", 64, 24},
        {"dspi_dotprod_off_u16 32x32", 64, 32},
    };
    int shift = 2;
    uint16_t offset = 7;
    const size_t max_N = 8192;

    // Element size now names the actual element type (same width as before).
    uint16_t *x = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    uint16_t *y = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    uint16_t *z = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int img = cases[c].image_dim;
        const int win = cases[c].window_dim;

        ESP_LOGI(TAG, "%s", cases[c].label);
        // The first image starts 3 elements into x, the second at the 16-byte aligned start of y.
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        image2d_t image2 = {y, 1, 1, win, win, win, win};

        uint16_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_u16(&image1, &image2, &result, win, win, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        // The cycle delta is unsigned, so it is printed with %u.
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint16_t result_ref = -1;
        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, win, win, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity takes (expected, actual); the label identifies the failing run.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_u16 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,70 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_u16_ansi";

// Runs dspi_dotprod_off_u16_ansi() on a 4x4 window at four different start
// positions inside the same data and compares every result with a hard-coded
// reference value.
TEST_CASE("dspi_dotprod_off_u16_ansi functionality", "[dspi]")
{
    // Hard-coded reference results for the two distinct data alignments.
    uint16_t check_value1 = 8676;
    uint16_t check_value2 = 8742;
    int shift = 7;
    uint16_t offset = 11;
    int max_N = 1024;

    uint16_t *x = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    uint16_t *y = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    uint16_t *z = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Only the first 256 elements are initialised; the pattern repeats every 8 elements.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
        z[i] = 0;
    }

    // NOTE(review): initialiser order is presumably
    // {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against image2d_t.
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    uint16_t result = -1;

    // Case 1: both windows start at element 0.
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity takes (expected, actual); this order keeps failure messages truthful.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both windows shifted by one element.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both windows shifted by stride_x elements; expected to match case 1.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both windows shifted by stride_x + 1 elements; expected to match case 2.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,122 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_u8";

// Compares dspi_dotprod_off_u8() with the reference dspi_dotprod_off_u8_ansi()
// for several window sizes and logs the cycle count of the former.
TEST_CASE("dspi_dotprod_off_u8_aexx functionality", "[dspi]")
{
    // One comparison run per entry; banners are the strings the test has always logged.
    static const struct {
        const char *label;  // banner logged before the run, also attached to a failure
        int image_dim;      // stride and size (both axes) of the first image
        int window_dim;     // stride/size of the second image and the count_x/count_y arguments
    } cases[] = {
        {"dspi_dotprod_off_u8 16x16",   64,  16},
        {"dspi_dotprod_off_u8 32x32",   64,  32},
        {"dspi_dotprod_off_u8 48x48",   64,  48},
        {"dspi_dotprod_off_u8 64x64",   128, 64},
        {"dspi_dotprod_off_u8 128x128", 128, 16},
    };
    int shift = 2;
    uint8_t offset = 7;
    const size_t max_N = 16384;

    uint8_t *x = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    uint8_t *y = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    uint8_t *z = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int img = cases[c].image_dim;
        const int win = cases[c].window_dim;

        ESP_LOGI(TAG, "%s", cases[c].label);
        // The first image starts 3 elements into x, the second at the 16-byte aligned start of y.
        // NOTE(review): a 128x128 image at &x[3] nominally extends past max_N; presumably only
        // the win x win window is read - confirm against the implementation.
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        image2d_t image2 = {y, 1, 1, win, win, win, win};

        uint8_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_u8(&image1, &image2, &result, win, win, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        // The cycle delta is unsigned, so it is printed with %u.
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint8_t result_ref = -1;
        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, win, win, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity takes (expected, actual); the label identifies the failing run.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_u8 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,70 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_off_u8_ansi";

// Runs dspi_dotprod_off_u8_ansi() on a 4x4 window at four different start
// positions inside the same data and compares every result with a hard-coded
// reference value.
TEST_CASE("dspi_dotprod_off_u8_ansi functionality", "[dspi]")
{
    // Hard-coded reference results for the two distinct data alignments.
    uint8_t check_value1 = 98;
    uint8_t check_value2 = 106;
    int shift = 7;
    uint8_t offset = 11;
    int max_N = 1024;

    uint8_t *x = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    uint8_t *y = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    uint8_t *z = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Only the first 256 elements are initialised; the pattern repeats every 8 elements.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
        z[i] = 0;
    }

    // NOTE(review): initialiser order is presumably
    // {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against image2d_t.
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    uint8_t result = -1;

    // Case 1: both windows start at element 0.
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity takes (expected, actual); this order keeps failure messages truthful.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both windows shifted by one element.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both windows shifted by stride_x elements; expected to match case 1.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both windows shifted by stride_x + 1 elements; expected to match case 2.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,106 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_s16";

// Compares dspi_dotprod_s16() with the reference dspi_dotprod_s16_ansi()
// for several window sizes and logs the cycle count of the former.
TEST_CASE("dspi_dotprod_s16_aexx functionality", "[dspi]")
{
    // One comparison run per entry; banners are the strings the test has always logged.
    static const struct {
        const char *label;  // banner logged before the run, also attached to a failure
        int image_dim;      // stride and size (both axes) of the first image
        int window_dim;     // stride/size of the second image and the count_x/count_y arguments
    } cases[] = {
        {"dspi_dotprod_s16 8x8",   64, 8},
        {"dspi_dotprod_s16 16x16", 64, 16},
        {"dspi_dotprod_s16 24x24", 64, 24},
        {"dspi_dotprod_s16 32x32", 64, 32},
    };
    int shift = 2;
    const size_t max_N = 8192;

    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int img = cases[c].image_dim;
        const int win = cases[c].window_dim;

        ESP_LOGI(TAG, "%s", cases[c].label);
        // The first image starts 3 elements into x, the second at the 16-byte aligned start of y.
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        image2d_t image2 = {y, 1, 1, win, win, win, win};

        int16_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_s16(&image1, &image2, &result, win, win, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        // The cycle delta is unsigned, so it is printed with %u.
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        int16_t result_ref = -1;
        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, win, win, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity takes (expected, actual); the label identifies the failing run.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_s16 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_s16_ansi";

// Runs dspi_dotprod_s16_ansi() on a 4x4 window at four different start
// positions inside the same data and compares every result with a hard-coded
// reference value.
TEST_CASE("dspi_dotprod_s16_ansi functionality", "[dspi]")
{
    // Hard-coded reference results for the two distinct data alignments.
    int16_t check_value1 = 8321;
    int16_t check_value2 = 8386;
    int shift = 7;
    int max_N = 1024;

    int16_t *x = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)malloc(max_N * sizeof(int16_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    // Only the first 256 elements are initialised; the pattern repeats every 8 elements.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
        z[i] = 0;
    }

    // NOTE(review): initialiser order is presumably
    // {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against image2d_t.
    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    int16_t result = -1;

    // Case 1: both windows start at element 0.
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity takes (expected, actual); this order keeps failure messages truthful.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both windows shifted by one element.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both windows shifted by stride_x elements; expected to match case 1.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both windows shifted by stride_x + 1 elements; expected to match case 2.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,121 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
static const char *TAG = "dspi_dotprod_s8";

// Compares dspi_dotprod_s8() with the reference dspi_dotprod_s8_ansi()
// for several window sizes and logs the cycle count of the former.
TEST_CASE("dspi_dotprod_s8_aexx functionality", "[dspi]")
{
    // One comparison run per entry; banners are the strings the test has always logged.
    static const struct {
        const char *label;  // banner logged before the run, also attached to a failure
        int image_dim;      // stride and size (both axes) of the first image
        int window_dim;     // stride/size of the second image and the count_x/count_y arguments
    } cases[] = {
        {"dspi_dotprod_s8 16x16",   64,  16},
        {"dspi_dotprod_s8 32x32",   64,  32},
        {"dspi_dotprod_s8 48x48",   64,  48},
        {"dspi_dotprod_s8 64x64",   128, 64},
        {"dspi_dotprod_s8 128x128", 128, 16},
    };
    int shift = 2;
    const size_t max_N = 16384;

    int8_t *x = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    int8_t *y = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    int8_t *z = (int8_t *)memalign(16, max_N * sizeof(int8_t));
    // Fail the test instead of dereferencing NULL when an allocation fails.
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int img = cases[c].image_dim;
        const int win = cases[c].window_dim;

        ESP_LOGI(TAG, "%s", cases[c].label);
        // The first image starts 3 elements into x, the second at the 16-byte aligned start of y.
        // NOTE(review): a 128x128 image at &x[3] nominally extends past max_N; presumably only
        // the win x win window is read - confirm against the implementation.
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        image2d_t image2 = {y, 1, 1, win, win, win, win};

        int8_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_s8(&image1, &image2, &result, win, win, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        // The cycle delta is unsigned, so it is printed with %u.
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        int8_t result_ref = -1;
        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, win, win, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity takes (expected, actual); the label identifies the failing run.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_s8 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dspi_dotprod.h"
#include "dsp_tests.h"
// Log tag for this test file (named after the function under test so the
// output can be filtered per test).
static const char *TAG = "dspi_dotprod_s8_ansi";

// Checks dspi_dotprod_s8_ansi against pre-computed values for four different
// start offsets into the same 8x8 test pattern.
TEST_CASE("dspi_dotprod_s8_ansi functionality", "[dspi]")
{
    // The pattern below repeats every 8 elements and the image stride is 8, so
    // start offsets 0 and stride_x see identical data (check_value1), and
    // offsets 1 and stride_x + 1 see identical data (check_value2).
    int8_t check_value1 = 67;
    int8_t check_value2 = 73;
    int shift = 7;
    int max_N = 1024;

    int8_t *x = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *y = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *z = (int8_t *)malloc(max_N * sizeof(int8_t));
    if (x == NULL || y == NULL || z == NULL) {
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("Failed to allocate test buffers");
    }

    // Only the first 256 elements are initialised; the 8x8 images below never
    // reach beyond them.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Case 1: both images start at element 0.
    int8_t result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity convention: expected value first, actual value second.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both images start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both images start one row (stride_x elements) down.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both images start one row down and one element to the right.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,106 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
// Log tag for this test file.
static const char *TAG = "dspi_dotprod_u16";

// Runs dspi_dotprod_u16 and dspi_dotprod_u16_ansi on the same operands for
// several kernel sizes and requires both to produce the same value. The cycle
// count of the dspi_dotprod_u16 call is logged for each size.
TEST_CASE("dspi_dotprod_u16_aexx functionality", "[dspi]")
{
    int shift = 2;
    int max_N = 8192;

    // Buffers are 16-byte aligned.
    uint16_t *x = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    uint16_t *y = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    uint16_t *z = (uint16_t *)memalign(16, max_N * sizeof(uint16_t));
    if (x == NULL || y == NULL || z == NULL) {
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("Failed to allocate test buffers");
    }
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (int i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    // One entry per sub-test: the label that is logged (and attached to a
    // failing assertion) and the edge length of the square kernel.
    static const struct {
        const char *label;
        int kernel_size;
    } cases[] = {
        {"dspi_dotprod_u16 8x8", 8},
        {"dspi_dotprod_u16 16x16", 16},
        {"dspi_dotprod_u16 24x24", 24},
        {"dspi_dotprod_u16 32x32", 32},
    };

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int k = cases[c].kernel_size;
        ESP_LOGI(TAG, "%s", cases[c].label);

        // 64x64 image whose data starts at x[3], i.e. not on the 16-byte
        // boundary of the allocation (presumably to exercise unaligned input
        // -- confirm).
        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64};
        // k x k kernel starting at the aligned base of y.
        image2d_t image2 = {y, 1, 1, k, k, k, k};

        uint16_t result = -1; // sentinel (0xFFFF) so an unwritten output is visible
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_u16(&image1, &image2, &result, k, k, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint16_t result_ref = -1;
        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, k, k, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);

        // Unity convention: expected (reference) first, actual second.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_u16 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dspi_dotprod.h"
#include "dsp_tests.h"
// Log tag for this test file.
static const char *TAG = "dspi_dotprod_u16_ansi";

// Checks dspi_dotprod_u16_ansi against pre-computed values for four different
// start offsets into the same 8x8 test pattern.
TEST_CASE("dspi_dotprod_u16_ansi functionality", "[dspi]")
{
    // The pattern below repeats every 8 elements and the image stride is 8, so
    // start offsets 0 and stride_x see identical data (check_value1), and
    // offsets 1 and stride_x + 1 see identical data (check_value2).
    uint16_t check_value1 = 8321;
    uint16_t check_value2 = 8386;
    int shift = 7;
    int max_N = 1024;

    uint16_t *x = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    uint16_t *y = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    uint16_t *z = (uint16_t *)malloc(max_N * sizeof(uint16_t));
    if (x == NULL || y == NULL || z == NULL) {
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("Failed to allocate test buffers");
    }

    // Only the first 256 elements are initialised; the 8x8 images below never
    // reach beyond them.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Case 1: both images start at element 0.
    uint16_t result = -1;
    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity convention: expected value first, actual value second.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both images start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both images start one row (stride_x elements) down.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both images start one row down and one element to the right.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,121 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include <malloc.h>
#include "dspi_dotprod.h"
#include "dsp_tests.h"
// Log tag for this test file.
static const char *TAG = "dspi_dotprod_u8";

// Runs dspi_dotprod_u8 and dspi_dotprod_u8_ansi on the same operands for
// several image/kernel sizes and requires both to produce the same value. The
// cycle count of the dspi_dotprod_u8 call is logged for each configuration.
TEST_CASE("dspi_dotprod_u8_aexx functionality", "[dspi]")
{
    int shift = 2;
    int max_N = 16384;

    // Buffers are 16-byte aligned.
    uint8_t *x = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    uint8_t *y = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    uint8_t *z = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
    if (x == NULL || y == NULL || z == NULL) {
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("Failed to allocate test buffers");
    }
    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);

    for (int i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    // One entry per sub-test: the label that is logged (and attached to a
    // failing assertion), the edge length of the square input image and the
    // edge length of the square kernel.
    static const struct {
        const char *label;
        int image_size;
        int kernel_size;
    } cases[] = {
        {"dspi_dotprod_u8 16x16", 64, 16},
        {"dspi_dotprod_u8 32x32", 64, 32},
        {"dspi_dotprod_u8 48x48", 64, 48},
        {"dspi_dotprod_u8 64x64", 128, 64},
        // Labelled after the 128x128 image; the kernel itself is 16x16.
        {"dspi_dotprod_u8 128x128", 128, 16},
    };

    for (size_t c = 0; c < sizeof(cases) / sizeof(cases[0]); c++) {
        const int n = cases[c].image_size;
        const int k = cases[c].kernel_size;
        ESP_LOGI(TAG, "%s", cases[c].label);

        // n x n image whose data starts at x[3], i.e. not on the 16-byte
        // boundary of the allocation (presumably to exercise unaligned input
        // -- confirm).
        // NOTE(review): for n = 128 a full image starting at x[3] would extend
        // 3 elements past the 16384-element buffer; presumably only the
        // k x k window is ever read -- confirm against the implementation.
        image2d_t image1 = {&x[3], 1, 1, n, n, n, n};
        // k x k kernel starting at the aligned base of y.
        image2d_t image2 = {y, 1, 1, k, k, k, k};

        uint8_t result = -1; // sentinel (0xFF) so an unwritten output is visible
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_u8(&image1, &image2, &result, k, k, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint8_t result_ref = -1;
        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, k, k, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);

        // Unity convention: expected (reference) first, actual second.
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_u8 done");
    free(x);
    free(y);
    free(z);
}

View File

@@ -0,0 +1,68 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "unity.h"
#include "dsp_platform.h"
#include "esp_log.h"
#include "dspi_dotprod.h"
#include "dsp_tests.h"
// Log tag for this test file (named after the function under test so the
// output can be filtered per test).
static const char *TAG = "dspi_dotprod_u8_ansi";

// Checks dspi_dotprod_u8_ansi against pre-computed values for four different
// start offsets into the same 8x8 test pattern.
TEST_CASE("dspi_dotprod_u8_ansi functionality", "[dspi]")
{
    // The pattern below repeats every 8 elements and the image stride is 8, so
    // start offsets 0 and stride_x see identical data (check_value1), and
    // offsets 1 and stride_x + 1 see identical data (check_value2).
    uint8_t check_value1 = 67;
    uint8_t check_value2 = 73;
    int shift = 7;
    int max_N = 1024;

    uint8_t *x = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    uint8_t *y = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    uint8_t *z = (uint8_t *)malloc(max_N * sizeof(uint8_t));
    if (x == NULL || y == NULL || z == NULL) {
        // free(NULL) is a no-op, so releasing all three pointers is safe.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("Failed to allocate test buffers");
    }

    // Only the first 256 elements are initialised; the 8x8 images below never
    // reach beyond them.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Case 1: both images start at element 0.
    uint8_t result = -1;
    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    // Unity convention: expected value first, actual value second.
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 2: both images start one element to the right.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Case 3: both images start one row (stride_x elements) down.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Case 4: both images start one row down and one element to the right.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
    free(z);
}